diff --git a/.gitattributes b/.gitattributes index 64c3ea819..cfe6a375b 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,4 +3,4 @@ *.ipynb linguist-detectable=false *RAFT.pdf filter=lfs diff=lfs merge=lfs -text *.gif filter=lfs diff=lfs merge=lfs -text -assets/*.gif filter=lfs diff=lfs merge=lfs -text +docs/figs/*.gif filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 697c3e822..1e2c5f31d 100644 --- a/.gitignore +++ b/.gitignore @@ -18,12 +18,13 @@ log/ regression_test/*/new_output_models regression_test/*/new_log output_dir/ +tests_out # data files data/ # output models -output_models/ +output_models adapter_model/ # Distribution / packaging @@ -168,9 +169,6 @@ debug.env #ctags tags -# pre-commit -.pre-commit* - # .lock *.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 000000000..8355e194c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,8 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.11.4" + hooks: + - id: ruff + args: ["--fix", "--show-fixes", "--output-format=full"] + exclude: ^.*\.(ipynb)$ + - id: ruff-format \ No newline at end of file diff --git a/README.md b/README.md index fce1d8756..c68bc605a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

 # LMFlow
@@ -26,25 +26,26 @@ An extensible, convenient, and efficient toolbox for finetuning large machine learning models, designed to be user-friendly, speedy and reliable, and accessible to the entire community.

-LMFlow-features +LMFlow-features

## Latest News +> [!IMPORTANT] +> * :exclamation: [2025-07-09] We have a major update to LMFlow with full Accelerate support and extensive streamlining. If you're looking for the previous version, please use `git checkout v0.0.10`, or check out the [v0.0.10 branch](https://github.com/OptimalScale/LMFlow/tree/v0.0.10). View all releases [here](https://github.com/OptimalScale/LMFlow/tags). -* [2025-03-18] With full support for Accelerate and lots of streamlining, LMFlow-nightly is now available! Feel free to try out the latest features and improvements by `git checkout lmflow-nightly`. * [2024-12-02] Support [Hymba](https://github.com/NVlabs/hymba), a new family of small language models featuring a hybrid-head parallel architecture. Check out [Post-training Hymba](https://github.com/OptimalScale/LMFlow/tree/main/experimental/Hymba) for more details. * [2024-07-01] 🏆 LMFlow receives the [**Best Demo Paper Award**](https://docs.google.com/presentation/d/1TVDooAZqkNObz5ysVhDFtqnnVHR-u8wqYvgix-gzPMs/edit#slide=id.g2e55907bbcc_0_70) at **NAACL 2024**! 🎉 * [2024-06-30] Expanding Optimization Options! We now support custom optimizer training with a variety of optimizers. Dive into the details and try out the new features with our updated script at [custom_optimizers](https://github.com/OptimalScale/LMFlow/blob/main/scripts/run_finetune_with_custom_optim.sh). * [2024-04-25] :rocket: Support conversation template! We've preset the latest [Llama-3](https://huggingface.co/meta-llama/Meta-Llama-3-70B) and [Phi-3](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) conversation templates as well as some frequently used templates such as `chatml` (see all templates [here](https://optimalscale.github.io/LMFlow/examples/DATASETS.html#conversation-template)), and we are working on adding more preset templates. Adding corresponding `--conversation_template` in the shell script and you are all set! :rocket: -* [2024-03-27] Support [LISA](https://arxiv.org/abs/2403.17919), enabling 7B training in 24G memory without offloading! -* [2023-09-11] Support [speculative decoding](https://arxiv.org/abs/2211.17192). Check out [speculative_decoding](https://github.com/OptimalScale/LMFlow/blob/main/scripts/speculative_decoding/README.md) for the usage and acceleration details. -* [2023-08-14] Support long context inference with position interpolation (Linear & NTK scaling ) for LLaMA models. Check out [postion_interpolation](https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details.
More news... +* [2024-03-27] Support [LISA](https://arxiv.org/abs/2403.17919), enabling 7B training in 24G memory without offloading! +* [2023-09-11] Support [speculative decoding](https://arxiv.org/abs/2211.17192). Check out [speculative_decoding](https://github.com/OptimalScale/LMFlow/blob/main/scripts/speculative_decoding/README.md) for the usage and acceleration details. +* [2023-08-14] Support long context inference with position interpolation (Linear & NTK scaling ) for LLaMA models. Check out [postion_interpolation](https://github.com/OptimalScale/LMFlow/blob/main/readme/Position_Interpolation.md) for more details. * [2023-08-07] Support [Flash Attention-2](https://crfm.stanford.edu/2023/07/17/flash2.html). Check out [flash_attention](https://github.com/OptimalScale/LMFlow/blob/main/readme/flash_attn2.md) for more details. * [2023-08-02] Support [Llama2](https://ai.meta.com/llama/), [ChatGLM2](https://huggingface.co/THUDM/chatglm2-6b), and [Baichuan](https://huggingface.co/baichuan-inc/Baichuan-7B) models. -* [2023-07-23] [LMFlow multimodal chatbot](https://github.com/OptimalScale/LMFlow/blob/main/scripts/run_vis_chatbot_gradio_minigpt4.sh) is now available! Support multimodal inputs of images and texts. [Online Demo](http://multimodal.lmflow.online) is also provided (We hold the service on a single GPU, hence one may experience "queuing" or "application busy" sometimes when multiple users are accessing at the same time, please wait and attempt again later when such event happens)![image](https://github.com/OptimalScale/LMFlow/blob/rpan-vision-encoder/assets/multimodal-chatbot-demo.gif) +* [2023-07-23] [LMFlow multimodal chatbot](https://github.com/OptimalScale/LMFlow/blob/main/scripts/run_vis_chatbot_gradio_minigpt4.sh) is now available! Support multimodal inputs of images and texts. [Online Demo](http://multimodal.lmflow.online) is also provided (We hold the service on a single GPU, hence one may experience "queuing" or "application busy" sometimes when multiple users are accessing at the same time, please wait and attempt again later when such event happens)![image](https://github.com/OptimalScale/LMFlow/blob/rpan-vision-encoder/docs/assets/multimodal-chatbot-demo.gif) * [2023-06-22] [LMFlow paper](https://arxiv.org/abs/2306.12420) is out! Check out our implementation details at https://arxiv.org/abs/2306.12420 * [2023-06-16] Our finetuned Robin-33B-V2 scored an impressive 64.1 on the Huggingface LLM leaderboard in our offline evaluation, outperforming major open-source LLMs! All checkpoints (7B, 13B, 33B, and 65B) are [released](https://huggingface.co/OptimalScale)! Checkout the performance [here](https://medium.com/@hkust.ml/robin-v2-launches-achieves-unparalleled-performance-on-openllm-4f6886e822c1). * [2023-06-07] LMFlow is now officially available on PyPI! Install it with `pip install lmflow-finetune`! 
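The `--conversation_template` flag mentioned in the 2024-04-25 item above is the only LMFlow-specific piece in the sketch below; the wrapper script name, model, dataset path, and output path are illustrative assumptions, not something this PR prescribes.

```bash
# Hedged sketch: assumes the repo's example finetuning wrapper accepts these
# flags and that a conversation-format dataset has already been prepared.
./scripts/run_finetune.sh \
    --model_name_or_path meta-llama/Meta-Llama-3-8B \
    --dataset_path data/alpaca/train_conversation \
    --conversation_template llama3 \
    --output_model_path output_models/finetuned_llama3
```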
@@ -69,11 +70,11 @@ An extensible, convenient, and efficient toolbox for finetuning large machine le - [LMFlow](#lmflow) - [Latest News](#latest-news) - [Table of Contents](#table-of-contents) - - [Supported Models](#supported-models) - [Quick Start](#quick-start) - [Setup](#setup) - [Prepare Dataset](#prepare-dataset) - [Finetuning](#finetuning) + - [Estimated Hardware Requirement](#estimated-hardware-requirement) - [Full Finetuning](#full-finetuning) - [LISA](#lisa) - [LoRA](#lora) @@ -85,21 +86,6 @@ An extensible, convenient, and efficient toolbox for finetuning large machine le - [License](#license) - [Citation](#citation) -## Supported Models - -See all conversation template details [here](https://optimalscale.github.io/LMFlow/examples/supported_conversation_template.html). - -| Model | Conversation Template | -| :---: | :-------------------: | -| DeepSeek | `deepseek`
`deepseek_v2`
`deepseek_r1`
`deepseek_r1_distill`
`deepseek_v3` | -| Gemma | `gemma` | -| Hymba | `hymba` | -| InternLM2 | `internlm2` | -| LLaMA | `llama2`
`llama3`
`llama3_for_tool`| -| Phi | `phi3` | -| Qwen | `qwen2`
`qwen2_for_tool`
`qwen2_5`
`qwen2_5_1m`
`qwen2_5_math`
`qwen_qwq` | -| Yi | `yi`
`yi1_5` | -| Zephyr | `zephyr` |
 ## Quick Start
@@ -108,7 +94,18 @@ See all conversation template details [here](https://optimalscale.github.io/LMFl
 ### Setup
 Our package has been tested on Linux OS (Ubuntu 20.04). Other OS platforms (MacOS, Windows) are not fully tested, where you may encounter unexpected errors. If you are using LMFlow for the first time, we recommend you to try on a Linux machine or Google Colab.
 ```bash
-git clone -b v0.0.9 https://github.com/OptimalScale/LMFlow.git
+git clone -b v1.0.0 https://github.com/OptimalScale/LMFlow.git
+cd LMFlow
+conda create -n lmflow python=3.9 -y
+conda activate lmflow
+conda install mpi4py
+pip install -e .
+```
+
+
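Separately from the install itself, this PR adds a `.pre-commit-config.yaml` with Ruff lint and format hooks (see the top of the diff). Contributors working from a source checkout can enable them with the standard pre-commit CLI; a minimal sketch, assuming `pre-commit` is not already present in the environment:

```bash
pip install pre-commit       # assumption: not already installed alongside LMFlow
pre-commit install           # register the git hook defined in .pre-commit-config.yaml
pre-commit run --all-files   # run the ruff and ruff-format hooks once over the whole repo
```

With the hook installed, `git commit` will run Ruff automatically on staged files.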
Looking for a previous version? + +```bash +git clone -b v0.0.10 https://github.com/OptimalScale/LMFlow.git cd LMFlow conda create -n lmflow python=3.9 -y conda activate lmflow @@ -116,7 +113,9 @@ conda install mpi4py pip install -e . ``` -
for CUDA versions 10.3-11.7 +
+ +
For CUDA versions 10.3-11.7 ```bash git clone -b v0.0.5 https://github.com/OptimalScale/LMFlow.git @@ -162,6 +161,16 @@ Please refer to our [doc](https://optimalscale.github.io/LMFlow/examples/DATASET ### Finetuning +#### Estimated Hardware Requirement + +| Method | 0.5B | 3B | 7B | 14B | 30B | 70B | `x`B | +| ---------------------- | ---- | ---- | ---- | ----- | ----- | ----- | ------- | +| Full `bf16`/`fp16` | 9GB | 55GB |120GB | 240GB | 600GB | 1200GB| `18x`GB | +| LoRA | 1GB | 6GB | 16GB | 32GB | 64GB | 160GB | `2x`GB | +| QLoRA `quant_bit=8` | 0.7GB| 3GB | 10GB | 20GB | 40GB | 80GB| `x`GB | +| QLoRA `quant_bit=4` | 0.4GB| 1.5GB| 6GB | 12GB | 24GB | 48GB| `x/2`GB | + + #### Full Finetuning Full training updates all the parameters to finetune a language model. diff --git a/configs/accelerate_fsdp_config.yaml b/configs/accelerate_fsdp_config.yaml new file mode 100644 index 000000000..9b0199849 --- /dev/null +++ b/configs/accelerate_fsdp_config.yaml @@ -0,0 +1,29 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP + +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_min_num_params: 1000000 + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_forward_prefetch: false + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true + +downcast_bf16: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 # NOTE: distributed_type should be `NO` if you're training on a single GPU +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +main_process_port: 1204 \ No newline at end of file diff --git a/configs/accelerate_singlegpu_config.yaml b/configs/accelerate_singlegpu_config.yaml new file mode 100644 index 000000000..d74fc4e63 --- /dev/null +++ b/configs/accelerate_singlegpu_config.yaml @@ -0,0 +1,29 @@ +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: 'NO' + +fsdp_config: + fsdp_auto_wrap_policy: SIZE_BASED_WRAP + fsdp_min_num_params: 1000000 + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_forward_prefetch: false + fsdp_cpu_ram_efficient_loading: true + fsdp_offload_params: false + fsdp_sharding_strategy: 'NO_SHARD' + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: true + +downcast_bf16: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 1 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +main_process_port: 1204 \ No newline at end of file diff --git a/configs/accelerator_multigpu_config.yaml b/configs/archive/accelerate_multigpu_config.yaml similarity index 100% rename from configs/accelerator_multigpu_config.yaml rename to configs/archive/accelerate_multigpu_config.yaml diff --git a/configs/accelerator_singlegpu_config.yaml b/configs/archive/accelerate_singlegpu_config.yaml similarity index 100% rename from configs/accelerator_singlegpu_config.yaml rename to configs/archive/accelerate_singlegpu_config.yaml diff --git a/configs/ds_config_chatbot.json b/configs/archive/ds_config_chatbot.json similarity index 100% rename from configs/ds_config_chatbot.json rename to configs/archive/ds_config_chatbot.json diff --git a/configs/ds_config_eval.json b/configs/archive/ds_config_eval.json similarity index 100% rename from 
configs/ds_config_eval.json rename to configs/archive/ds_config_eval.json diff --git a/configs/ds_config_multimodal.json b/configs/archive/ds_config_multimodal.json similarity index 100% rename from configs/ds_config_multimodal.json rename to configs/archive/ds_config_multimodal.json diff --git a/configs/ds_config_vis_chatbot.json b/configs/archive/ds_config_vis_chatbot.json similarity index 100% rename from configs/ds_config_vis_chatbot.json rename to configs/archive/ds_config_vis_chatbot.json diff --git a/configs/ds_config_zero0_no_offload.json b/configs/deepspeed/zero0_no_offload.json similarity index 100% rename from configs/ds_config_zero0_no_offload.json rename to configs/deepspeed/zero0_no_offload.json diff --git a/configs/ds_config_zero2.json b/configs/deepspeed/zero2.json similarity index 100% rename from configs/ds_config_zero2.json rename to configs/deepspeed/zero2.json diff --git a/configs/ds_config_zero2_no_offload.json b/configs/deepspeed/zero2_no_offload.json similarity index 100% rename from configs/ds_config_zero2_no_offload.json rename to configs/deepspeed/zero2_no_offload.json diff --git a/configs/ds_config_zero3.json b/configs/deepspeed/zero3.json similarity index 100% rename from configs/ds_config_zero3.json rename to configs/deepspeed/zero3.json diff --git a/configs/ds_config_zero3_for_eval.json b/configs/deepspeed/zero3_for_eval.json similarity index 100% rename from configs/ds_config_zero3_for_eval.json rename to configs/deepspeed/zero3_for_eval.json diff --git a/configs/ds_config_zero3_no_offload.json b/configs/deepspeed/zero3_no_offload.json similarity index 100% rename from configs/ds_config_zero3_no_offload.json rename to configs/deepspeed/zero3_no_offload.json diff --git a/configs/iterative_dpo.yaml b/configs/iterative_dpo.yaml index 1b18e1d03..d70a41ead 100644 --- a/configs/iterative_dpo.yaml +++ b/configs/iterative_dpo.yaml @@ -17,7 +17,6 @@ preprocessing_num_workers: 16 output_dir: ./output_models/iterative_dpo run_name: iterative_dpo random_seed: 42 -use_accelerator: True enable_distributed_inference: True distributed_inference_num_instances: 8 initial_iter_idx: 0 # 0 refers to the first dataset in dataset_path_list diff --git a/contrib/langchain/retrieval_chatbot.py b/contrib/langchain/retrieval_chatbot.py index f8bf7c5fb..97a0434ca 100644 --- a/contrib/langchain/retrieval_chatbot.py +++ b/contrib/langchain/retrieval_chatbot.py @@ -1,38 +1,36 @@ -from langchain_openai import ChatOpenAI +import argparse +import logging +import os +import re +from pathlib import Path + from langchain_anthropic import ChatAnthropic -from langchain_google_genai import ChatGoogleGenerativeAI -from langchain_community.llms import HuggingFacePipeline +from langchain_chroma import Chroma from langchain_community.chat_message_histories import ChatMessageHistory + +# retrieval usage +from langchain_community.document_loaders import TextLoader, WebBaseLoader +from langchain_community.llms import HuggingFacePipeline from langchain_core.chat_history import BaseChatMessageHistory +from langchain_core.messages import SystemMessage +from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder from langchain_core.runnables import Runnable from langchain_core.runnables.history import RunnableWithMessageHistory -from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, MessagesPlaceholder -from langchain_core.messages import SystemMessage - -# retrieval usage -from langchain_community.document_loaders import WebBaseLoader, 
TextLoader +from langchain_google_genai import ChatGoogleGenerativeAI +from langchain_openai import ChatOpenAI, OpenAIEmbeddings from langchain_text_splitters import RecursiveCharacterTextSplitter -from langchain_chroma import Chroma -from langchain_openai import OpenAIEmbeddings -from pathlib import Path -import re -import os -import argparse -import logging logging.getLogger().setLevel(logging.ERROR) # hide warning log class LangchainChatbot: - def __init__(self, - model_name_or_path: str, - provider: str): + def __init__(self, model_name_or_path: str, provider: str): self.prompt = ChatPromptTemplate.from_messages( [ SystemMessage(content="You are a helpful chatbot."), MessagesPlaceholder(variable_name="history"), MessagesPlaceholder(variable_name="retriever", optional=True), - HumanMessagePromptTemplate.from_template("{input}") + HumanMessagePromptTemplate.from_template("{input}"), ] ) self.model_name_or_path = model_name_or_path @@ -53,13 +51,13 @@ def __init__(self, def check_valid_provider(self): provider = self.provider model_name_or_path = self.model_name_or_path - if provider == "openai" and 'gpt' in model_name_or_path: + if provider == "openai" and "gpt" in model_name_or_path: if os.getenv("OPENAI_API_KEY") is None: raise OSError("OPENAI_API_KEY environment variable is not set.") - elif provider == "anthropic" and 'claude' in model_name_or_path: + elif provider == "anthropic" and "claude" in model_name_or_path: if os.getenv("ANTHROPIC_API_KEY") is None: raise OSError("ANTHROPIC_API_KEY environment variable is not set.") - elif provider == "google" and 'gemini' in model_name_or_path: + elif provider == "google" and "gemini" in model_name_or_path: if os.getenv("GOOGLE_API_KEY") is None: raise OSError("GOOGLE_API_KEY environment variable is not set.") elif provider == "huggingface": @@ -77,7 +75,7 @@ def set_retriever_url(self, url, chunk_size, chunk_overlap): self.retriever_url = vectorstore.as_retriever(k=4) def set_retriever_file(self, file, chunk_size, chunk_overlap): - loader = TextLoader(file, encoding='utf-8') + loader = TextLoader(file, encoding="utf-8") data = loader.load() text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) all_splits = text_splitter.split_documents(data) @@ -107,16 +105,16 @@ def chat_with_chatbot(self, human_input, session_id): if self.retriever_file: retriever_search.extend(self.retrieve_by_file(human_input)) - response = self.llm_chain.invoke({"input": human_input, - "retriever": retriever_search}, - config={"configurable": {"session_id": session_id}}) + response = self.llm_chain.invoke( + {"input": human_input, "retriever": retriever_search}, config={"configurable": {"session_id": session_id}} + ) return response if self.provider == "huggingface" else response.content def retrieve_by_url(self, query): - return [re.sub('\n+', '\n', dict(result)['page_content']) for result in self.retriever_url.invoke(query)] + return [re.sub("\n+", "\n", dict(result)["page_content"]) for result in self.retriever_url.invoke(query)] def retrieve_by_file(self, query): - return [re.sub('\n+', '\n', dict(result)['page_content']) for result in self.retriever_file.invoke(query)] + return [re.sub("\n+", "\n", dict(result)["page_content"]) for result in self.retriever_file.invoke(query)] def get_session_history(self, session_id: str) -> BaseChatMessageHistory: if session_id not in self.memory: @@ -128,48 +126,32 @@ def get_cli() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description=__doc__, ) - 
parser.add_argument( - "--model-name-or-path", type=str, help="Model name or path" - ) - parser.add_argument( - "--provider", type=str, help="Provider of the model" - ) - parser.add_argument( - "--set-url", action="store_true", help="Set a URL for retrieval if enabled" - ) - parser.add_argument( - "--set-txt", action="store_true", help="Set a single text file for retrieval if enabled" - ) - parser.add_argument( - "--chunk-size", type=int, default=400, help="Chunk size for splitting documents." - ) - parser.add_argument( - "--chunk-overlap", type=int, default=20, help="Chunk overlap for splitting documents." - ) - parser.add_argument( - "--session-id", type=str, default="demo", help="Session id of this chat" - ) - parser.add_argument( - "--save-history", action="store_true", help="Save chat history if enabled" - ) + parser.add_argument("--model-name-or-path", type=str, help="Model name or path") + parser.add_argument("--provider", type=str, help="Provider of the model") + parser.add_argument("--set-url", action="store_true", help="Set a URL for retrieval if enabled") + parser.add_argument("--set-txt", action="store_true", help="Set a single text file for retrieval if enabled") + parser.add_argument("--chunk-size", type=int, default=400, help="Chunk size for splitting documents.") + parser.add_argument("--chunk-overlap", type=int, default=20, help="Chunk overlap for splitting documents.") + parser.add_argument("--session-id", type=str, default="demo", help="Session id of this chat") + parser.add_argument("--save-history", action="store_true", help="Save chat history if enabled") parser.add_argument( "--save-dir", type=Path, default=Path("tmp", "chat_history"), help="Directory to store chat history" ) return parser -def main(model_name_or_path: str, - provider: str, - set_url: bool, - set_txt: bool, - chunk_size: int, - chunk_overlap: int, - session_id: str, - save_history: bool, - save_dir: Path - ): - chatbot = LangchainChatbot(model_name_or_path=model_name_or_path, - provider=provider) +def main( + model_name_or_path: str, + provider: str, + set_url: bool, + set_txt: bool, + chunk_size: int, + chunk_overlap: int, + session_id: str, + save_history: bool, + save_dir: Path, +): + chatbot = LangchainChatbot(model_name_or_path=model_name_or_path, provider=provider) if set_url: url = input("Please enter the url: ") chatbot.set_retriever_url(url, chunk_size, chunk_overlap) @@ -183,12 +165,12 @@ def main(model_name_or_path: str, response = chatbot.chat_with_chatbot(human_input, session_id) print(f"Chatbot: {response}") if save_history: - if '/' in model_name_or_path: + if "/" in model_name_or_path: model_name_or_path = Path(model_name_or_path).parts[-1] if not os.path.exists(save_dir): os.makedirs(save_dir) save_path = Path(save_dir, f"{model_name_or_path}_{session_id}.txt") - with open(save_path, 'w') as file: + with open(save_path, "w") as file: file.write(str(chatbot.memory[session_id].messages)) diff --git a/contrib/long-context/sft_summarizer.py b/contrib/long-context/sft_summarizer.py index 34190dc29..979b57458 100644 --- a/contrib/long-context/sft_summarizer.py +++ b/contrib/long-context/sft_summarizer.py @@ -1,35 +1,36 @@ #!/usr/bin/env python -# coding=utf-8 import logging -import os,sys -# os.environ['CUDA_VISIBLE_DEVICES'] = "6" -from transformers.trainer_callback import TrainerControl, TrainerState -import wandb -from colorama import Fore,init -from typing import Optional, List +import os +from dataclasses import dataclass, field +from typing import Optional import torch +import wandb 
+from colorama import Fore, init from datasets import load_dataset -from dataclasses import dataclass, field from tqdm.rich import tqdm -from transformers import AutoTokenizer, TrainingArguments, TrainerCallback +from transformers import AutoTokenizer, TrainerCallback, TrainingArguments + +# os.environ['CUDA_VISIBLE_DEVICES'] = "6" +from transformers.trainer_callback import TrainerControl, TrainerState from lmflow.utils.versioning import is_trl_available if is_trl_available(): from trl import ( - ModelConfig, - SFTTrainer, DataCollatorForCompletionOnlyLM, + ModelConfig, SFTConfig, + SFTTrainer, + get_kbit_device_map, get_peft_config, get_quantization_config, - get_kbit_device_map, ) from trl.commands.cli_utils import TrlParser else: raise ImportError("Please install trl package to use sft_summarizer.py") + @dataclass class UserArguments: wandb_key: Optional[str] = field( @@ -39,35 +40,36 @@ class UserArguments: default="huggingface_sft_summarizer", metadata={"help": "The name of project saved in wandb"} ) + if __name__ == "__main__": # Initialize logging, tqdm and init - logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') + logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s") tqdm.pandas() init(autoreset=True) parser = TrlParser((UserArguments, SFTConfig, ModelConfig)) user_args, sft_config, model_config = parser.parse_args_and_config() - + # Initialize wandb if user_args.wandb_key: - wandb.login(key=user_args.wandb_key) # replace your own wandb key if there are multiple wandb accounts in your server + wandb.login( + key=user_args.wandb_key + ) # replace your own wandb key if there are multiple wandb accounts in your server else: wandb.init(mode="offline") wandb.init(project=user_args.wandb_projectname) # https://huggingface.co/docs/transformers/en/main_classes/trainer#transformers.TrainingArguments logging.debug(sft_config) - logging.debug('-' * 50) + logging.debug("-" * 50) logging.debug(model_config) - logging.debug('-' * 50) - logging.debug('cuda===> %s', os.environ['CUDA_VISIBLE_DEVICES']) - + logging.debug("-" * 50) + logging.debug("cuda===> %s", os.environ["CUDA_VISIBLE_DEVICES"]) if model_config.use_peft: - use_peft = 'peft' + use_peft = "peft" else: - use_peft = 'nopeft' - + use_peft = "nopeft" ################ # Model & Tokenizer @@ -79,7 +81,7 @@ class UserArguments: ) logging.debug("torch_dtype===> %s", torch_dtype) if model_config.use_peft: - quantization_config = None + quantization_config = None else: quantization_config = get_quantization_config(model_config) logging.debug("quantization_config===> %s", quantization_config) @@ -91,7 +93,7 @@ class UserArguments: use_cache=False if sft_config.gradient_checkpointing else True, device_map=get_kbit_device_map() if quantization_config is not None else None, quantization_config=quantization_config, - local_files_only=True + local_files_only=True, ) logging.debug("model_kwargs: %s", model_kwargs) tokenizer = AutoTokenizer.from_pretrained(model_config.model_name_or_path, use_fast=True, local_files_only=True) @@ -101,10 +103,10 @@ class UserArguments: # Dataset ################ - train_dataset = load_dataset("LukaMagic077/downsampled_below10k_arxiv_dataset_on_hub", split='train') - val_dataset = load_dataset("LukaMagic077/downsampled_below10k_arxiv_dataset_on_hub", split='validation') + train_dataset = load_dataset("LukaMagic077/downsampled_below10k_arxiv_dataset_on_hub", split="train") + val_dataset = 
load_dataset("LukaMagic077/downsampled_below10k_arxiv_dataset_on_hub", split="validation") # test_dataset = load_dataset("LukaMagic077/downsampled_below10k_arxiv_dataset_on_hub", split='test') - + # Get the size of training dataset train_dataset_size = len(train_dataset) # Get the size of validation dataset @@ -117,13 +119,10 @@ class UserArguments: ################ # Training ################ - + # Define datacollector data_collector = DataCollatorForCompletionOnlyLM( - instruction_template="article", - response_template="abstract", - tokenizer=tokenizer, - mlm=False + instruction_template="article", response_template="abstract", tokenizer=tokenizer, mlm=False ) class WandbCallback(TrainerCallback): @@ -131,20 +130,21 @@ def __init__(self, trainer): # trainer.model.to("cuda:0") self.model, self.tokenizer = trainer.model, trainer.tokenizer self.tokenizer.pad_token = self.tokenizer.eos_token - logging.debug(Fore.GREEN + "entering callback=====>") + logging.debug(Fore.GREEN) + logging.debug("entering callback=====>") logging.debug(self.tokenizer) + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): logging.debug("current step %s", state.global_step) return super().on_save(args, state, control, **kwargs) - trainer = SFTTrainer( model=model_config.model_name_or_path, model_init_kwargs=model_kwargs, args=sft_config, - train_dataset= train_dataset, + train_dataset=train_dataset, dataset_text_field="article", - eval_dataset= val_dataset, + eval_dataset=val_dataset, tokenizer=tokenizer, peft_config=get_peft_config(model_config), ) diff --git a/contrib/rlhflow/reward_modeling.py b/contrib/rlhflow/reward_modeling.py index 9cd755ead..7a5b2a8cc 100644 --- a/contrib/rlhflow/reward_modeling.py +++ b/contrib/rlhflow/reward_modeling.py @@ -1,24 +1,23 @@ -from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Union import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import numpy as np import torch import torch.nn as nn from datasets import load_dataset from peft import LoraConfig, TaskType, get_peft_model - from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser, Trainer, ) + from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) ## Prepare training_args @@ -31,7 +30,7 @@ else: model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() -pipeline_args.remove_unused_columns = False +pipeline_args.remove_unused_columns = False pipeline_args.label_names = [] ## Get model, by default we use lora to accelerate training @@ -42,7 +41,9 @@ lora_dropout=0.1, ) # trust_remote_code=True if you want to use chatglm -model = AutoModelForSequenceClassification.from_pretrained(model_args.model_name_or_path, num_labels=1, torch_dtype=torch.bfloat16) +model = AutoModelForSequenceClassification.from_pretrained( + model_args.model_name_or_path, num_labels=1, torch_dtype=torch.bfloat16 +) model_lora = get_peft_model(model, peft_config) model_lora.print_trainable_parameters() @@ -66,15 +67,18 @@ model_lora.config.pad_token_id = tokenizer.eos_token_id assert model_lora.config.pad_token_id == tokenizer.pad_token_id + ## Get the dataset def build_dataset(tokenizer, config): - ''' + """ We assume that we have preprocessed the dataset appropriately such that the sample is organized as follows: - {"positive": prompt + answer_positive, "negative": prompt + answer_negative}, where the positive response is 
preferred. - ''' + {"positive": prompt + answer_positive, "negative": prompt + answer_negative}, + where the positive response is preferred. + """ + def tokenize(sample): - tokenized_pos = tokenizer(sample['positive'], truncation=True) - tokenized_neg = tokenizer(sample['negative'], truncation=True) + tokenized_pos = tokenizer(sample["positive"], truncation=True) + tokenized_neg = tokenizer(sample["negative"], truncation=True) sample["chosen_input_ids"] = tokenized_pos["input_ids"] sample["chosen_attention_mask"] = tokenized_pos["attention_mask"] sample["rejected_input_ids"] = tokenized_neg["input_ids"] @@ -86,7 +90,7 @@ def tokenize(sample): ds = ds.filter(lambda x: len(x["chosen_input_ids"]) <= 512 and len(x["rejected_input_ids"]) <= 512) eval_dataset = None if config.validation_split_percentage > 0: - idx_gap = int((1-config.validation_split_percentage/100) * len(ds)) + idx_gap = int((1 - config.validation_split_percentage / 100) * len(ds)) train_dataset = ds.select(range(idx_gap)) eval_dataset = ds.select(range(idx_gap, len(ds))) else: @@ -94,20 +98,23 @@ def tokenize(sample): return train_dataset, eval_dataset + train_dataset, eval_dataset = build_dataset(tokenizer, data_args) if not eval_dataset and pipeline_args.eval_steps > 0: raise ValueError("Cannot evaluate on an empty eval set") print("Training set: ", len(train_dataset), " Eval set: ", len(eval_dataset)) + ## Define the trainer def compute_metrics(eval_pred): result = {} pos_predictions_scores = eval_pred.predictions[0] neg_predictions_scores = eval_pred.predictions[1] # We assume that the first sample is preferred by default in groundtruth - result['accuracy'] = np.sum(pos_predictions_scores >= neg_predictions_scores) / len(pos_predictions_scores) + result["accuracy"] = np.sum(pos_predictions_scores >= neg_predictions_scores) / len(pos_predictions_scores) return result - + + class DataCollatorReward: def __init__(self, tokenizer): self.tokenizer = tokenizer @@ -117,27 +124,34 @@ def __call__(self, data): data_pos = [] data_neg = [] for sample in data: - data_pos.append({"input_ids": sample['chosen_input_ids'], "attention_mask": sample["chosen_attention_mask"]}) - data_neg.append({"input_ids": sample['rejected_input_ids'], "attention_mask": sample["rejected_attention_mask"]}) + data_pos.append( + {"input_ids": sample["chosen_input_ids"], "attention_mask": sample["chosen_attention_mask"]} + ) + data_neg.append( + {"input_ids": sample["rejected_input_ids"], "attention_mask": sample["rejected_attention_mask"]} + ) batch_pos = self.tokenizer.pad(data_pos, padding=True, return_tensors="pt") batch_neg = self.tokenizer.pad(data_neg, padding=True, return_tensors="pt") - batch['chosen_input_ids'] = batch_pos['input_ids'] - batch['rejected_input_ids'] = batch_neg['input_ids'] - batch['chosen_attention_mask'] = batch_pos['attention_mask'] - batch['rejected_attention_mask'] = batch_neg['attention_mask'] - batch['return_loss'] = True + batch["chosen_input_ids"] = batch_pos["input_ids"] + batch["rejected_input_ids"] = batch_neg["input_ids"] + batch["chosen_attention_mask"] = batch_pos["attention_mask"] + batch["rejected_attention_mask"] = batch_neg["attention_mask"] + batch["return_loss"] = True return batch class RMTrainer(Trainer): def compute_loss(self, model, inputs, return_outputs=False): chosen_rewards = model(input_ids=inputs["chosen_input_ids"], attention_mask=inputs["chosen_attention_mask"])[0] - rejected_rewards = model(input_ids=inputs["rejected_input_ids"], attention_mask=inputs["rejected_attention_mask"])[0] + rejected_rewards 
= model( + input_ids=inputs["rejected_input_ids"], attention_mask=inputs["rejected_attention_mask"] + )[0] loss = -nn.functional.logsigmoid(chosen_rewards - rejected_rewards).mean() if return_outputs: return loss, {"chosen_rewards": chosen_rewards, "rejected_rewards": rejected_rewards} return loss + data_collator = DataCollatorReward(tokenizer=tokenizer) trainer = RMTrainer( model=model_lora, @@ -152,4 +166,3 @@ def compute_loss(self, model, inputs, return_outputs=False): ## Save model model_lora.save_pretrained(pipeline_args.output_dir) - diff --git a/contrib/rlhflow/run_reward_modeling.sh b/contrib/rlhflow/run_reward_modeling.sh index d83fe789f..fcee04ef5 100644 --- a/contrib/rlhflow/run_reward_modeling.sh +++ b/contrib/rlhflow/run_reward_modeling.sh @@ -30,7 +30,7 @@ deepspeed ${deepspeed_args} \ --block_size 512 \ --per_device_train_batch_size 1 \ --per_device_eval_batch_size 1\ - --deepspeed configs/ds_config_zero2.json \ + --deepspeed configs/archive/ds_config_zero2.json \ --bf16 \ --run_name rm_test \ --validation_split_percentage 10 \ diff --git a/contrib/text2image/diffuser_args.py b/contrib/text2image/diffuser_args.py index 7a2cf4697..0d6f2a9ce 100644 --- a/contrib/text2image/diffuser_args.py +++ b/contrib/text2image/diffuser_args.py @@ -1,54 +1,45 @@ -from dataclasses import dataclass, field -from typing import Optional, List import os +from dataclasses import dataclass, field +from typing import Optional from lmflow.args import DatasetArguments + @dataclass class T2IDatasetArguments(DatasetArguments): """Arguments for T2I dataset""" - - image_folder: Optional[str] = field( - default=None, metadata={"help": "The folder of the image file."} - ) - - image_size: Optional[int] = field( - default=512, metadata={"help": "The size of the image."} - ) - - image_crop_type: Optional[str] = field( - default="center", metadata={"help": "The type of image crop."} - ) - - text_embedding_type: Optional[str] = field( - default="raw", metadata={"help": "How to get text embedding."} - ) - - is_t2i: Optional[bool] = field( - default=True, metadata={"help": "Flag for the modality type."} - ) - + + image_folder: Optional[str] = field(default=None, metadata={"help": "The folder of the image file."}) + + image_size: Optional[int] = field(default=512, metadata={"help": "The size of the image."}) + + image_crop_type: Optional[str] = field(default="center", metadata={"help": "The type of image crop."}) + + text_embedding_type: Optional[str] = field(default="raw", metadata={"help": "How to get text embedding."}) + + is_t2i: Optional[bool] = field(default=True, metadata={"help": "Flag for the modality type."}) + def __post_init__(self): def check_extension(file_path: str, extension: str): assert file_path.split(".")[-1] == extension, f"The file must be a {extension} file." 
- - + if self.dataset_path is None or self.image_folder is None: raise ValueError("The dataset_path, image_folder must be provided.") - + else: if self.train_file is None: if os.path.exists(os.path.join(self.dataset_path, "train.json")): self.train_file = "train.json" else: raise ValueError("The train_file must be provided.") - + check_extension(self.train_file, "json") - if (self.validation_file is not None and self.test_file is None)\ - or (self.validation_file is None and self.test_file is not None): - same_file = self.validation_file if self.validation_file is not None else self.test_file - self.validation_file = same_file - self.test_file = same_file + if (self.validation_file is not None and self.test_file is None) or ( + self.validation_file is None and self.test_file is not None + ): + same_file = self.validation_file if self.validation_file is not None else self.test_file + self.validation_file = same_file + self.test_file = same_file if self.validation_file is not None: check_extension(self.validation_file, "json") if not os.path.exists(os.path.join(self.dataset_path, self.validation_file)): @@ -58,18 +49,15 @@ def check_extension(file_path: str, extension: str): if not os.path.exists(os.path.join(self.dataset_path, self.test_file)): self.test_file = None -@dataclass + +@dataclass class DiffuserModelArguments: """Arguments for T2I model""" - - model_name_or_path: Optional[str] = field( - default=None, metadata={"help": "The model name or path."} - ) - - model_type: Optional[str] = field( - default=None, metadata={"help": "The model type."} - ) - + + model_name_or_path: Optional[str] = field(default=None, metadata={"help": "The model name or path."}) + + model_type: Optional[str] = field(default=None, metadata={"help": "The model type."}) + # torch_dtype: Optional[str] = field( # default=None, # metadata={ @@ -80,12 +68,12 @@ class DiffuserModelArguments: # "choices": ["auto", "bfloat16", "float16", "float32"], # }, # ) - + use_lora: bool = field( default=False, metadata={"help": "Whether to lora."}, ) - + lora_r: int = field( default=8, metadata={"help": "the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has."}, @@ -93,80 +81,53 @@ class DiffuserModelArguments: lora_alpha: int = field( default=8, metadata={ - "help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, - ) - lora_target_modules: List[str] = field( - default=None, metadata={"help": "Modules to apply lora."} + "help": ( + "Merging ratio between the fine-tuned model and the original. This is controlled by a " + "parameter called alpha in the paper." 
+ ), + }, ) + lora_target_modules: list[str] = field(default=None, metadata={"help": "Modules to apply lora."}) lora_dropout: float = field( default=0.1, metadata={"help": "The dropout rate in lora.linear."}, ) - -@dataclass + + +@dataclass class DiffuserTunerArguments: """Arguments for T2I finetuner""" - - output_dir: Optional[str] = field( - default="output", metadata={"help": "The output directory."} - ) - - logging_dir: Optional[str] = field( - default="logs", metadata={"help": "The logging directory."} - ) - + + output_dir: Optional[str] = field(default="output", metadata={"help": "The output directory."}) + + logging_dir: Optional[str] = field(default="logs", metadata={"help": "The logging directory."}) + overwrite_output_dir: bool = field( default=False, metadata={"help": "Overwrite the content of the output directory."} ) - - mixed_precision: str = field( - default="no", metadata={"help": "Whether to use mixed precision."} - ) - - do_train: bool = field( - default=True, metadata={"help": "Whether to run training."} - ) - - num_train_epochs: Optional[int] = field( - default=50, metadata={"help": "The number of training epochs."} - ) - - train_batch_size: Optional[int] = field( - default=1, metadata={"help": "The number of batch size in training."} - ) - - learning_rate: Optional[float] = field( - default=1e-4, metadata={"help": "The learning rate."} - ) - - weight_decay: Optional[float] = field( - default=0.0, metadata={"help": "The weight decay."} - ) - - do_valid: bool = field( - default=True, metadata={"help": "Whether to run evaluation."} - ) - - do_test: bool = field( - default=True, metadata={"help": "Whether to run testing."} - ) - - valid_steps: Optional[int] = field( - default=50, metadata={"help": "The evaluation steps."} - ) - - valid_seed: Optional[int] = field( - default=42, metadata={"help": "The seed for validation."} - ) - - test_seed: Optional[int] = field( - default=42, metadata={"help": "The seed for testing."} - ) - - save_steps: Optional[int] = field( - default=500, metadata={"help": "The saving steps."} - ) - - save_total_limit: Optional[int] = field( - default=None, metadata={"help": "The total number of checkpoints to save."} - ) \ No newline at end of file + + mixed_precision: str = field(default="no", metadata={"help": "Whether to use mixed precision."}) + + do_train: bool = field(default=True, metadata={"help": "Whether to run training."}) + + num_train_epochs: Optional[int] = field(default=50, metadata={"help": "The number of training epochs."}) + + train_batch_size: Optional[int] = field(default=1, metadata={"help": "The number of batch size in training."}) + + learning_rate: Optional[float] = field(default=1e-4, metadata={"help": "The learning rate."}) + + weight_decay: Optional[float] = field(default=0.0, metadata={"help": "The weight decay."}) + + do_valid: bool = field(default=True, metadata={"help": "Whether to run evaluation."}) + + do_test: bool = field(default=True, metadata={"help": "Whether to run testing."}) + + valid_steps: Optional[int] = field(default=50, metadata={"help": "The evaluation steps."}) + + valid_seed: Optional[int] = field(default=42, metadata={"help": "The seed for validation."}) + + test_seed: Optional[int] = field(default=42, metadata={"help": "The seed for testing."}) + + save_steps: Optional[int] = field(default=500, metadata={"help": "The saving steps."}) + + save_total_limit: Optional[int] = field(default=None, metadata={"help": "The total number of checkpoints to save."}) diff --git 
a/contrib/text2image/diffuser_finetuner.py b/contrib/text2image/diffuser_finetuner.py index 7276b991d..847818c78 100644 --- a/contrib/text2image/diffuser_finetuner.py +++ b/contrib/text2image/diffuser_finetuner.py @@ -1,36 +1,33 @@ +import copy +import gc import json -import os import logging -import gc -import copy - -from tqdm import tqdm -import wandb +import os import torch import torch.nn.functional as F -from torch.utils.data import DataLoader +import wandb +from accelerate import Accelerator +from diffuser_args import DiffuserModelArguments, DiffuserTunerArguments, T2IDatasetArguments from diffusers import ( - DiffusionPipeline, - UNet2DConditionModel, DDPMScheduler, + DiffusionPipeline, ) from diffusers.loaders import LoraLoaderMixin +from diffusers.optimization import get_scheduler from diffusers.utils import ( convert_state_dict_to_diffusers, - convert_unet_state_dict_to_peft, ) from diffusers.utils.torch_utils import is_compiled_module -from diffusers.optimization import get_scheduler -from accelerate import Accelerator -from peft.utils import get_peft_model_state_dict, set_peft_model_state_dict +from peft.utils import get_peft_model_state_dict +from torch.utils.data import DataLoader +from tqdm import tqdm from lmflow.pipeline.finetuner import BaseTuner -from diffuser_args import T2IDatasetArguments, DiffuserModelArguments, DiffuserTunerArguments - logger = logging.getLogger(__name__) + def log_validation( pipeline, accelerator: Accelerator, @@ -40,12 +37,10 @@ def log_validation( ): pipeline.to(accelerator.device) pipeline.vae.to(torch.float32) - + with torch.no_grad(): - prompt_images = [ - (pipeline_arg["prompt"], pipeline(**pipeline_arg).images[0]) for pipeline_arg in pipeline_args - ] - + prompt_images = [(pipeline_arg["prompt"], pipeline(**pipeline_arg).images[0]) for pipeline_arg in pipeline_args] + for tracker in accelerator.trackers: if tracker.name == "wandb": tracker.log( @@ -55,18 +50,19 @@ def log_validation( ] } ) - + if not os.path.exists(save_dir): os.makedirs(save_dir) for i, (prompt, image) in enumerate(prompt_images): image.save(os.path.join(save_dir, f"{prompt.replace(' ', '_')}.png")) - + del pipeline torch.cuda.empty_cache() gc.collect() - + return + class DiffuserModelTuner(BaseTuner): """Initializes the `RewardModelTuner` class. @@ -87,18 +83,19 @@ class DiffuserModelTuner(BaseTuner): kwargs : Optional. Keyword arguments. 
""" + def __init__( - self, + self, model_args: DiffuserModelArguments, data_args: T2IDatasetArguments, finetuner_args: DiffuserTunerArguments, - *args, - **kwargs + *args, + **kwargs, ): self.model_args = model_args self.data_args = data_args self.finetuner_args = finetuner_args - + def tune( self, accelerator: Accelerator, @@ -106,85 +103,87 @@ def tune( dataset, ): dataloader = DataLoader(dataset=dataset, batch_size=self.finetuner_args.train_batch_size, shuffle=True) - + noise_scheduler = DDPMScheduler.from_pretrained(self.model_args.model_name_or_path, subfolder="scheduler") - + def unwrap_model(model): model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model - return model - + return model + # filter trainable parameters params_to_optimize = list(filter(lambda p: p.requires_grad, model.parameters())) accelerator.print(len(params_to_optimize)) - + optimizer = torch.optim.AdamW( - params_to_optimize, - lr=self.finetuner_args.learning_rate, - weight_decay=self.finetuner_args.weight_decay + params_to_optimize, lr=self.finetuner_args.learning_rate, weight_decay=self.finetuner_args.weight_decay ) - + lr_scheduler = get_scheduler( "constant", optimizer=optimizer, ) - - model, dataloader, optimizer, lr_scheduler = accelerator.prepare( - model, dataloader, optimizer, lr_scheduler - ) + + model, dataloader, optimizer, lr_scheduler = accelerator.prepare(model, dataloader, optimizer, lr_scheduler) weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 - + progress_bar = tqdm( range(self.finetuner_args.num_train_epochs * len(dataloader)), desc="Training", - disable=not accelerator.is_main_process + disable=not accelerator.is_main_process, ) - + global_step = 0 for epoch in range(self.finetuner_args.num_train_epochs): model.train() for batch in dataloader: clean_latents = batch["image"].to(dtype=weight_dtype) text_embedding = batch["text"].to(dtype=weight_dtype) - + bsz, channel, height, width = clean_latents.shape noise = torch.randn_like(clean_latents).to(dtype=weight_dtype) timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_latents.device ) timesteps = timesteps.long() - + noisy_latents = noise_scheduler.add_noise(clean_latents, noise, timesteps) model_pred = model( - noisy_latents, timesteps, text_embedding, + noisy_latents, + timesteps, + text_embedding, )[0] - + if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": target = noise_scheduler.get_velocity(clean_latents, noise, timesteps) else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") - + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") - + accelerator.backward(loss) optimizer.step() lr_scheduler.step() optimizer.zero_grad() - + progress_bar.update(1) - + if accelerator.is_main_process: logs = {"loss": loss.item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) global_step += 1 - + # validation - if accelerator.is_main_process and self.finetuner_args.do_valid and self.data_args.validation_file is not None: + if ( + accelerator.is_main_process + and self.finetuner_args.do_valid + and self.data_args.validation_file is not None + ): if global_step % self.finetuner_args.valid_steps == 0: with torch.no_grad(): pipeline = DiffusionPipeline.from_pretrained( @@ -197,12 +196,19 @@ def unwrap_model(model): 
pipeline.transformer = unwrap_model(model) else: raise ValueError(f"Unknown model type {self.model_args.model_type}") - - with open(os.path.join(self.data_args.dataset_path, self.data_args.validation_file), "r") as f: + + with open(os.path.join(self.data_args.dataset_path, self.data_args.validation_file)) as f: validation_data = json.load(f) - generator = torch.Generator(device=accelerator.device).manual_seed(self.finetuner_args.valid_seed) + generator = torch.Generator(device=accelerator.device).manual_seed( + self.finetuner_args.valid_seed + ) pipeline_args = [ - {"prompt": item["text"], "generator": generator, "width": self.data_args.image_size, "height": self.data_args.image_size} + { + "prompt": item["text"], + "generator": generator, + "width": self.data_args.image_size, + "height": self.data_args.image_size, + } for item in validation_data["instances"] ] log_validation( @@ -212,27 +218,38 @@ def unwrap_model(model): os.path.join(self.finetuner_args.output_dir, f"step_{global_step}_validation"), global_step, ) - + if accelerator.is_main_process and global_step % self.finetuner_args.save_steps == 0: - os.makedirs(os.path.join(self.finetuner_args.output_dir, f"checkpoints"), exist_ok=True) - if len(os.listdir(os.path.join(self.finetuner_args.output_dir, f"checkpoints"))) > self.finetuner_args.max_checkpoints: - os.remove(os.path.join(self.finetuner_args.output_dir, f"checkpoints", sorted(os.listdir(os.path.join(self.finetuner_args.output_dir, f"checkpoints")))[0])) + os.makedirs(os.path.join(self.finetuner_args.output_dir, "checkpoints"), exist_ok=True) + if ( + len(os.listdir(os.path.join(self.finetuner_args.output_dir, "checkpoints"))) + > self.finetuner_args.max_checkpoints + ): + os.remove( + os.path.join( + self.finetuner_args.output_dir, + "checkpoints", + sorted(os.listdir(os.path.join(self.finetuner_args.output_dir, "checkpoints")))[0], + ) + ) if self.model_args.use_lora: temp_model = unwrap_model(copy.deepcopy(model)) temp_model = temp_model.to(torch.float32) model_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(temp_model)) LoraLoaderMixin.save_lora_weights( - save_directory=os.path.join(self.finetuner_args.output_dir, f"checkpoints", f"final"), + save_directory=os.path.join(self.finetuner_args.output_dir, "checkpoints", "final"), unet_lora_layers=model_lora_state_dict if self.model_args.model_type == "unet" else None, - transformer_lora_layers=model_lora_state_dict if self.model_args.model_type == "transformer" else None, + transformer_lora_layers=model_lora_state_dict + if self.model_args.model_type == "transformer" + else None, ) del temp_model else: accelerator.save( accelerator.get_state_dict(model), - os.path.join(self.finetuner_args.output_dir, f"checkpoints", f"final.pt") + os.path.join(self.finetuner_args.output_dir, "checkpoints", "final.pt"), ) - + accelerator.wait_for_everyone() progress_bar.close() if accelerator.is_main_process: @@ -247,38 +264,44 @@ def unwrap_model(model): pipeline.transformer = unwrap_model(model) else: raise ValueError(f"Unknown model type {self.model_args.model_type}") - - with open(os.path.join(self.data_args.dataset_path, self.data_args.test_file), "r") as f: + + with open(os.path.join(self.data_args.dataset_path, self.data_args.test_file)) as f: test_data = json.load(f) generator = torch.Generator(device=accelerator.device).manual_seed(self.finetuner_args.test_seed) pipeline_args = [ - {"prompt": item["text"], "generator": generator, "width": self.data_args.image_size, "height": self.data_args.image_size} + { 
+ "prompt": item["text"], + "generator": generator, + "width": self.data_args.image_size, + "height": self.data_args.image_size, + } for item in test_data["instances"] ] log_validation( pipeline, accelerator, pipeline_args, - os.path.join(self.finetuner_args.output_dir, f"test_final"), + os.path.join(self.finetuner_args.output_dir, "test_final"), global_step, ) - - os.makedirs(os.path.join(self.finetuner_args.output_dir, f"checkpoints"), exist_ok=True) + + os.makedirs(os.path.join(self.finetuner_args.output_dir, "checkpoints"), exist_ok=True) if self.model_args.use_lora: model = unwrap_model(model) model = model.to(torch.float32) model_lora_state_dict = convert_state_dict_to_diffusers(get_peft_model_state_dict(model)) LoraLoaderMixin.save_lora_weights( - save_directory=os.path.join(self.finetuner_args.output_dir, f"checkpoints", f"final"), + save_directory=os.path.join(self.finetuner_args.output_dir, "checkpoints", "final"), unet_lora_layers=model_lora_state_dict if self.model_args.model_type == "unet" else None, - transformer_lora_layers=model_lora_state_dict if self.model_args.model_type == "transformer" else None, + transformer_lora_layers=model_lora_state_dict + if self.model_args.model_type == "transformer" + else None, ) # pipeline.load_lora_weights(output_dir, weight_name="pytorch_lora_weights.safetensors") else: accelerator.save( accelerator.get_state_dict(model), - os.path.join(self.finetuner_args.output_dir, f"checkpoints", f"final.pt") + os.path.join(self.finetuner_args.output_dir, "checkpoints", "final.pt"), ) - + return - \ No newline at end of file diff --git a/contrib/text2image/finetune_t2i.py b/contrib/text2image/finetune_t2i.py index 7c12acdae..1dc0eaf88 100644 --- a/contrib/text2image/finetune_t2i.py +++ b/contrib/text2image/finetune_t2i.py @@ -1,34 +1,26 @@ -import sys import os + os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["WANDB_MODE"] = "offline" +import gc import shutil from pathlib import Path -import gc import torch -from diffusers import ( - AutoencoderKL, - UNet2DConditionModel -) -from transformers import ( - AutoTokenizer, - CLIPTextModel -) from accelerate import Accelerator from accelerate.utils import ProjectConfiguration -from diffuser_args import T2IDatasetArguments, DiffuserModelArguments, DiffuserTunerArguments -from t2i_dataset import build_t2i_dataset +from diffuser_args import DiffuserModelArguments, DiffuserTunerArguments, T2IDatasetArguments from diffuser_finetuner import DiffuserModelTuner -from transformers import HfArgumentParser - +from diffusers import AutoencoderKL, UNet2DConditionModel from peft import LoraConfig +from t2i_dataset import build_t2i_dataset +from transformers import AutoTokenizer, CLIPTextModel, HfArgumentParser + def main(): parser = HfArgumentParser((DiffuserModelArguments, T2IDatasetArguments, DiffuserTunerArguments)) model_args, data_args, tuner_args = parser.parse_args_into_dataclasses() - - + logging_dir = Path(tuner_args.output_dir, tuner_args.logging_dir) accelerator_project_config = ProjectConfiguration(project_dir=tuner_args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( @@ -36,20 +28,20 @@ def main(): log_with="wandb", project_config=accelerator_project_config, ) - + if accelerator.is_main_process and tuner_args.overwrite_output_dir and os.path.exists(tuner_args.output_dir): shutil.rmtree(tuner_args.output_dir) - + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, subfolder="tokenizer") text_encoder = CLIPTextModel.from_pretrained(model_args.model_name_or_path, 
subfolder="text_encoder").to("cuda") vae = AutoencoderKL.from_pretrained(model_args.model_name_or_path, subfolder="vae").to("cuda") - + dataset = build_t2i_dataset(data_args, tokenizer, text_encoder, vae) - + del tokenizer, text_encoder, vae torch.cuda.empty_cache() gc.collect() - + model = None if model_args.model_type == "unet": model = UNet2DConditionModel.from_pretrained(model_args.model_name_or_path, subfolder=model_args.model_type) @@ -70,19 +62,20 @@ def main(): model.add_adapter(lora_config) else: model.requires_grad_(True) - + fintuner = DiffuserModelTuner(model_args, data_args, tuner_args) - accelerator.init_trackers("text2image-finetune", config={ - "data_args": data_args, - "model_args": model_args, - "tuner_args": tuner_args, - }) - - accelerator.wait_for_everyone() - fintuner.tune( - accelerator=accelerator, - model=model, dataset=dataset + accelerator.init_trackers( + "text2image-finetune", + config={ + "data_args": data_args, + "model_args": model_args, + "tuner_args": tuner_args, + }, ) -if __name__ == '__main__': + accelerator.wait_for_everyone() + fintuner.tune(accelerator=accelerator, model=model, dataset=dataset) + + +if __name__ == "__main__": main() diff --git a/contrib/text2image/t2i_dataset.py b/contrib/text2image/t2i_dataset.py index aafe2de6e..2e35b5393 100644 --- a/contrib/text2image/t2i_dataset.py +++ b/contrib/text2image/t2i_dataset.py @@ -1,112 +1,119 @@ #!/usr/bin/env python -# coding=utf-8 -"""This Python code defines a class T2I Dataset. -""" +"""This Python code defines a class T2I Dataset.""" + import json -from PIL import Image -import os.path as osp -from tqdm import tqdm import logging +import os.path as osp -import torch +from diffuser_args import T2IDatasetArguments +from PIL import Image from torch.utils.data import Dataset from torchvision import transforms - -from diffuser_args import T2IDatasetArguments +from tqdm import tqdm logger = logging.getLogger(__name__) + class CustomT2IDataset(Dataset): """Dataset for T2I data""" - + def __init__(self, data_args: T2IDatasetArguments): self.data_args = data_args self.image_folder = osp.join(data_args.dataset_path, data_args.image_folder) self.data_file = osp.join(data_args.dataset_path, data_args.train_file) - - self.data_dict = json.load(open(self.data_file, "r")) + + self.data_dict = json.load(open(self.data_file)) assert self.data_dict["type"] == "text-image", "The dataset type must be text-image." 
- + self.data_instances = self.data_dict["instances"] - + def __len__(self): return len(self.data_instances) - + def __getitem__(self, idx): instance = self.data_instances[idx] image_path = osp.join(self.image_folder, instance["image"]) image = Image.open(image_path) image = image.convert("RGB") - + return { "image": image, "text": instance["text"], } -class EncodePreprocessor(object): + +class EncodePreprocessor: def __init__(self, data_args: T2IDatasetArguments, kind: str = "simple", **kwargs): self.transform = transforms.Compose( [ transforms.Resize(data_args.image_size, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(data_args.image_size) if data_args.image_crop_type == "center" else transforms.RandomCrop(data_args.image_size), + transforms.CenterCrop(data_args.image_size) + if data_args.image_crop_type == "center" + else transforms.RandomCrop(data_args.image_size), transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5]), ] ) - + self.pre_func = None if kind == "simple": self.register_simple_func(**kwargs) - + def register_simple_func(self, tokenizer, text_encoder, vae): self.tokenizer = tokenizer self.text_encoder = text_encoder self.vae = vae - + def simple_func(data_item): image = self.transform(data_item["image"]) latents = self.vae.encode(image.to(self.vae.device, dtype=self.vae.dtype).unsqueeze(0)).latent_dist.sample() encoded_image = latents * self.vae.config.scaling_factor encoded_image = encoded_image.detach() - encoded_image=encoded_image.squeeze(0).cpu() - + encoded_image = encoded_image.squeeze(0).cpu() + max_length = self.tokenizer.model_max_length - tokens = self.tokenizer([data_item["text"]], max_length=max_length, padding="max_length", truncation=True, return_tensors="pt").input_ids + tokens = self.tokenizer( + [data_item["text"]], max_length=max_length, padding="max_length", truncation=True, return_tensors="pt" + ).input_ids encoded_text = self.text_encoder(tokens.to(self.text_encoder.device))[0] encoded_text = encoded_text.detach() - encoded_text =encoded_text.squeeze(0).cpu() - + encoded_text = encoded_text.squeeze(0).cpu() + return { "image": encoded_image, "text": encoded_text, } - + self.pre_func = simple_func - + def __call__(self, data_item): - return self.pre_func(data_item) - + return self.pre_func(data_item) + + class PreprocessedT2IDataset(Dataset): "Preprocess dataset with prompt" - - def __init__(self, raw_dataset:Dataset, data_args: T2IDatasetArguments, preprocessor:EncodePreprocessor): + + def __init__(self, raw_dataset: Dataset, data_args: T2IDatasetArguments, preprocessor: EncodePreprocessor): self.data_dict = [] - + logger.info("Preprocessing data ...") for data_item in tqdm(raw_dataset): self.data_dict.append(preprocessor(data_item)) - + def __len__(self): return len(self.data_dict) - + def __getitem__(self, idx): return self.data_dict[idx] + def build_t2i_dataset(data_args: T2IDatasetArguments, tokenizer, text_encoder, vae): raw_dataset = CustomT2IDataset(data_args) # dataset = SimpleT2IDataset(raw_dataset, data_args, tokenizer, text_encoder, vae) - preprocessor = EncodePreprocessor(kind="simple", data_args=data_args, tokenizer=tokenizer, text_encoder=text_encoder, vae=vae) + preprocessor = EncodePreprocessor( + kind="simple", data_args=data_args, tokenizer=tokenizer, text_encoder=text_encoder, vae=vae + ) dataset = PreprocessedT2IDataset(raw_dataset, data_args, preprocessor) - - return dataset \ No newline at end of file + + return dataset diff --git a/contrib/tool-finetune/function_call_finetune.py 
b/contrib/tool-finetune/function_call_finetune.py index 313563db5..942e8a4a2 100644 --- a/contrib/tool-finetune/function_call_finetune.py +++ b/contrib/tool-finetune/function_call_finetune.py @@ -1,57 +1,51 @@ import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import hashlib -from typing import Dict, List, Union, Tuple, Optional, Sequence import logging -from datasets import Features +from typing import Union + import transformers -from transformers.tokenization_utils import PreTrainedTokenizer -from transformers.testing_utils import CaptureLogger from transformers import HfArgumentParser, PreTrainedTokenizer, PreTrainedTokenizerFast -from peft import LoraConfig, TaskType +from transformers.testing_utils import CaptureLogger + +from lmflow.args import ( + AutoArguments, + DatasetArguments, + ModelArguments, +) from lmflow.datasets.dataset import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.tokenization.hf_decoder_model import blocking -from lmflow.utils.conversation_template.base import TemplateComponent from lmflow.utils.constants import ( - TEXT_ONLY_DATASET_DESCRIPTION, - TEXT2TEXT_DATASET_DESCRIPTION, CONVERSATION_DATASET_DESCRIPTION, - CONVERSATION_ROLE_NAMES -) -from lmflow.utils.conversation_template import ConversationTemplateForTool, PRESET_TEMPLATES -from lmflow.args import ( - ModelArguments, - DatasetArguments, - AutoArguments, + CONVERSATION_ROLE_NAMES, + TEXT2TEXT_DATASET_DESCRIPTION, + TEXT_ONLY_DATASET_DESCRIPTION, ) -from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.utils.conversation_template import PRESET_TEMPLATES, ConversationTemplateForTool + tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") logger = logging.getLogger(__name__) class HFDecoderModelForTool(HFDecoderModel): - def tokenize( - self, - dataset, - add_special_tokens=True, - *args, - **kwargs - ) -> Dataset: + def tokenize(self, dataset, add_special_tokens=True, *args, **kwargs) -> Dataset: """ Tokenize the full dataset. - + Parameters ------------ dataset : lmflow.datasets.Dataset. args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ tokenized_datasets : @@ -62,10 +56,7 @@ def tokenize( # Preprocessing the datasets. # First we tokenize all the texts. if dataset.get_backend() != "huggingface": - raise NotImplementedError( - "tokenization of datasets with non-huggingface backend are" - "not supported yet" - ) + raise NotImplementedError("tokenization of datasets with non-huggingface backend are not supported yet") dataset_type = dataset.get_type() model_args = self.model_args @@ -84,8 +75,8 @@ def tokenize( # 3) Which fields require loss in final computation, e.g. # "text_only": "text" # "text2text": "output" only - tokenized_column_order = None # Handles 1) and 2) - label_columns = None # Handles 3) + tokenized_column_order = None # Handles 1) and 2) + label_columns = None # Handles 3) if dataset_type == "text_only": tokenized_column_order = ["text"] label_columns = ["text"] @@ -103,12 +94,12 @@ def tokenize( ) else: logger.warning("No conversation template provided.
Using default template.") - conversation_template = PRESET_TEMPLATES['empty'] - + conversation_template = PRESET_TEMPLATES["empty"] + logger.warning(f"Conversation template: {conversation_template}") else: raise NotImplementedError( - f"dataset type \"{dataset_type}\" is not supported, currently" + f'dataset type "{dataset_type}" is not supported, currently' " only support following data types:\n" f" 1) {TEXT_ONLY_DATASET_DESCRIPTION}\n" f" 2) {TEXT2TEXT_DATASET_DESCRIPTION}\n" @@ -119,7 +110,7 @@ def tokenize( use_truncation = False if model_args.use_lora or data_args.disable_group_texts: use_truncation = True - + tokenize_fn = conversation_tokenize_function tokenize_fn_kwargs = { "data_args": data_args, @@ -133,17 +124,21 @@ def tokenize( tokenize_fn_kwargs["tokenized_column_order"] = tokenized_column_order tokenize_fn_kwargs["add_special_tokens"] = add_special_tokens tokenize_fn_kwargs["use_truncation"] = use_truncation - + tokenize_kwargs = {} if not data_args.streaming: fingerprint = hashlib.md5( ( raw_datasets.get_fingerprint() + str(self.tokenizer) - + f'###padding_side={self.tokenizer.padding_side}' - + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "") - + f'###disable_group_texts={data_args.disable_group_texts}' - + f'###block_size={data_args.block_size}' + + f"###padding_side={self.tokenizer.padding_side}" + + ( + "###conversation_template=" + str(conversation_template) + if "conversation" in dataset_type + else "" + ) + + f"###disable_group_texts={data_args.disable_group_texts}" + + f"###block_size={data_args.block_size}" ).encode("utf-8") ).hexdigest() tokenize_kwargs = { @@ -154,24 +149,20 @@ def tokenize( } tokenized_datasets = raw_datasets.map( - tokenize_fn, - batched=True, - remove_columns=column_names, - fn_kwargs=tokenize_fn_kwargs, - **tokenize_kwargs + tokenize_fn, batched=True, remove_columns=column_names, fn_kwargs=tokenize_fn_kwargs, **tokenize_kwargs ) return tokenized_datasets + def conversation_tokenize_function( - examples, + examples, data_args: DatasetArguments, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, conversation_template: ConversationTemplateForTool, -) -> Dict: - """Handels conversation datasets tokenization - """ +) -> dict: + """Handels conversation datasets tokenization""" num_example = len(examples[column_names[0]]) token_dict = { "input_ids": [[] for _ in range(num_example)], @@ -183,19 +174,17 @@ def conversation_tokenize_function( messages = examples["messages"][i] system = examples.get("system", [None] * num_example)[i] tools = examples.get("tools", [None] * num_example)[i] - if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: + if len(messages) < 2 or messages[0]["role"] != CONVERSATION_ROLE_NAMES["user"]: tok_logger.warning( "Invalid instance encountered. Either the conversation has less than " "one round or the first message is not from the user." ) continue - + if len(messages) % 2 != 0: - logger.warning( - "The number of messages is not even, the last message will be ignored." 
- ) + logger.warning("The number of messages is not even, the last message will be ignored.") messages = messages[:-1] - + encoded_conversation = conversation_template.encode_conversation( tokenizer=tokenizer, messages=messages, @@ -222,10 +211,15 @@ def conversation_tokenize_function( if data_args.train_on_prompt: labels += user_input + function_result + observation_input + assistant_result else: - labels += [-100] * len(user_input) + function_result + [-100] * len(observation_input) + assistant_result + labels += ( + [-100] * len(user_input) + + function_result + + [-100] * len(observation_input) + + assistant_result + ) else: logger.warning("The number of roles in conversation is not appropriate") - + token_dict["input_ids"][i].extend(input_ids) token_dict["attention_mask"][i].extend([1] * len(input_ids)) token_dict["labels"][i].extend(labels) @@ -249,7 +243,6 @@ def conversation_tokenize_function( def train(): - # Initialize args ## Prepare training_args pipeline_name = "finetuner" @@ -258,7 +251,7 @@ def train(): if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: - model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() + model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() print("Model args", model_args) print("data_args", data_args) print("training_args", pipeline_args) @@ -276,7 +269,8 @@ def train(): data_args=data_args, pipeline_args=pipeline_args, ) - tuned_model = finetuner.tune(model=model, dataset=dataset) + finetuner.tune(model=model, dataset=dataset) + if __name__ == "__main__": - train() \ No newline at end of file + train() diff --git a/contrib/tool-finetune/run_function_call_finetune.sh b/contrib/tool-finetune/run_function_call_finetune.sh index 220abee9a..185ca72ea 100755 --- a/contrib/tool-finetune/run_function_call_finetune.sh +++ b/contrib/tool-finetune/run_function_call_finetune.sh @@ -65,7 +65,7 @@ deepspeed ${deepspeed_args} \ --disable_group_texts 1 \ --block_size 1024 \ --per_device_train_batch_size 1 \ - --deepspeed configs/ds_config_zero3.json \ + --deepspeed configs/archive/ds_config_zero3.json \ --fp16 \ --run_name finetune \ --validation_split_percentage 0 \ diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 60b1d5712..000000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -FROM nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04 - -ENV TZ=Etc/UTC -RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone - -RUN apt-get update --fix-missing && apt-get install -y fontconfig --fix-missing -RUN apt-get install -y libopenmpi-dev -RUN apt-get install -y git python3.9 python3.9-dev python3.9-venv -RUN python3.9 -m venv /venv -ENV PATH=/venv/bin:$PATH -RUN pip install mpi4py - -ARG SRCDIR - -RUN mkdir /LMFlow/ -WORKDIR /LMFlow/ - -COPY $SRCDIR/ /LMFlow/ - -RUN pip install wheel -RUN pip install -e . diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 5e7fa482f..000000000 --- a/docker/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# Docker - -LMFlow is available as a docker image in Docker Hub, built from the Dockerfile -in this directory, with cuda:11.3.0-cudnn8 (source docker: -nvidia/cuda:11.3.0-cudnn8-devel-ubuntu20.04). You need to have at least a -Nvidia 3090 GPU on your machine with cuda driver compatible with cuda:11.3.0 to -run this docker image. 
- -## Install docker with nvidia support - -First you may need to install docker with nvidia support. This step requires -root permission. If you don't have one, you may need to contact the system -adminstrator to do that for you. - -We provide an example in Ubuntu 20.04. For other operating systems, you may -refer to Nvidia's [Install -Guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). - -```sh -curl https://get.docker.com | sh && sudo systemctl --now enable docker - -distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ - && curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ - | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ - && curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list \ - | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ - | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - -sudo apt-get update -sudo apt-get install -y nvidia-container-toolkit -sudo nvidia-ctk runtime configure --runtime=docker -sudo systemctl restart docker -``` - -## Pull docker image and run - -Use the following command to pull our docker image. - -```sh -docker pull optimalscale/lmflow -``` - -The working directory in docker is `/LMFlow`, where LMFlow (commit: -[fa0e66f94](https://github.com/OptimalScale/LMFlow/tree/fa0e66f94eb5b7bfd624afdf9826b054641e3373)) -is cloned and installed. Use the following command to enter the docker -container, where `./LMFlow/log/finetune` in the container will be mapped to -`./output_dir/log/finetune` on the host machine. You may add more directory -mappings in a similar manner. - -```sh -docker run \ - -v ./output_dir/log/finetune:/LMFlow/log/finetune \ - --gpus=all \ - --shm-size=64g \ - -e WANDB_DISABLED=true \ - -it \ - --rm \ - optimalscale/lmflow \ - bash -``` - -Then you will be able to work inside the docker, just like in a physical -machine. Notice that to use multiple gpus, you need to allocate enough -shared memory. We have setup the dependency for you, so you can directly -run our scripts, e.g. 
- -``` -./scripts/run_chatbot.sh -./scripts/run_evaluation.sh - -# May need a GPU with --bf16 support, or you can remove --bf16 -# and use --fp16 instead -./scripts/run_finetune.sh -``` diff --git a/assets/Cockatoo3b.png b/docs/assets/Cockatoo3b.png similarity index 100% rename from assets/Cockatoo3b.png rename to docs/assets/Cockatoo3b.png diff --git a/assets/Cockatoo7b.png b/docs/assets/Cockatoo7b.png similarity index 100% rename from assets/Cockatoo7b.png rename to docs/assets/Cockatoo7b.png diff --git a/assets/Hymba_loss.png b/docs/assets/Hymba_loss.png similarity index 100% rename from assets/Hymba_loss.png rename to docs/assets/Hymba_loss.png diff --git a/assets/Parakeets.png b/docs/assets/Parakeets.png similarity index 100% rename from assets/Parakeets.png rename to docs/assets/Parakeets.png diff --git a/assets/colab-shell-chatbot-demo.png b/docs/assets/colab-shell-chatbot-demo.png similarity index 100% rename from assets/colab-shell-chatbot-demo.png rename to docs/assets/colab-shell-chatbot-demo.png diff --git a/assets/features.png b/docs/assets/features.png similarity index 100% rename from assets/features.png rename to docs/assets/features.png diff --git a/assets/logo.png b/docs/assets/logo.png similarity index 100% rename from assets/logo.png rename to docs/assets/logo.png diff --git a/assets/multimodal-chatbot-demo.gif b/docs/assets/multimodal-chatbot-demo.gif similarity index 100% rename from assets/multimodal-chatbot-demo.gif rename to docs/assets/multimodal-chatbot-demo.gif diff --git a/assets/robin13b.png b/docs/assets/robin13b.png similarity index 100% rename from assets/robin13b.png rename to docs/assets/robin13b.png diff --git a/assets/robin13b_.jpg b/docs/assets/robin13b_.jpg similarity index 100% rename from assets/robin13b_.jpg rename to docs/assets/robin13b_.jpg diff --git a/assets/robin33b.png b/docs/assets/robin33b.png similarity index 100% rename from assets/robin33b.png rename to docs/assets/robin33b.png diff --git a/assets/robin33b_.png b/docs/assets/robin33b_.png similarity index 100% rename from assets/robin33b_.png rename to docs/assets/robin33b_.png diff --git a/assets/robin65b.png b/docs/assets/robin65b.png similarity index 100% rename from assets/robin65b.png rename to docs/assets/robin65b.png diff --git a/assets/robin65b_.png b/docs/assets/robin65b_.png similarity index 100% rename from assets/robin65b_.png rename to docs/assets/robin65b_.png diff --git a/assets/robin7b.jpg b/docs/assets/robin7b.jpg similarity index 100% rename from assets/robin7b.jpg rename to docs/assets/robin7b.jpg diff --git a/assets/robin7b_.png b/docs/assets/robin7b_.png similarity index 100% rename from assets/robin7b_.png rename to docs/assets/robin7b_.png diff --git a/docs/dev_notes/finetuning.mmd b/docs/dev_notes/finetuning.mmd new file mode 100644 index 000000000..d2c2cd947 --- /dev/null +++ b/docs/dev_notes/finetuning.mmd @@ -0,0 +1,62 @@ +sequenceDiagram + participant User + participant Finetuner as LMFlow Finetuner + participant Model as LMFlow Model + participant Dataset as LMFlow Dataset + participant Trainer as Trainer + + User->>Finetuner: tune(model, dataset) + + %% Tokenization + Finetuner->>Model: tokenize(dataset) + Model->>Dataset: Apply tokenization to dataset + + alt if not disable_group_texts + Finetuner->>Finetuner: group_text(tokenized_dataset, model_max_length) + end + + %% Prepare for training + Finetuner->>Finetuner: Prepare dataset for trainer + + %% Create appropriate trainer based on configuration + alt if model_args.use_lora + Finetuner->>Finetuner: 
Initialize PeftTrainer + else + Finetuner->>Finetuner: Initialize standard Trainer + end + + alt if training_args.use_customized_optim + Finetuner->>Finetuner: create_customized_optimizer() + end + + alt if training_args.use_lisa + Finetuner->>Finetuner: Create DynamicLayerActivationCallback + end + + %% Start training + Finetuner->>Trainer: train(resume_from_checkpoint) + + %% Training loop (simplified) + loop Training iterations (Trainer._inner_training_loop simplified) + Trainer->>Model: Forward pass + Model-->>Trainer: Return predictions + Trainer->>Trainer: Compute loss + Trainer->>Model: Backward pass + Model->>Model: Compute Gradient + Trainer->>Trainer: Optimizer step + end + + %% Save the model + alt if not model_args.use_lora + Trainer->>Trainer: save_model() + else + alt if model_args.save_aggregated_lora + Finetuner->>Model: merge_lora_weights() + end + Finetuner->>Model: save(output_dir, save_aggregated_lora) + end + + %% Finish and return + Trainer-->>Finetuner: Return train result + Finetuner->>Finetuner: Log metrics + Finetuner-->>User: Return fine-tuned model \ No newline at end of file diff --git a/readme/Position_Interpolation.md b/docs/readme/Position_Interpolation.md similarity index 100% rename from readme/Position_Interpolation.md rename to docs/readme/Position_Interpolation.md diff --git a/readme/README_es.md b/docs/readme/README_es.md similarity index 98% rename from readme/README_es.md rename to docs/readme/README_es.md index cdf2d9b5a..c02dcf0be 100644 --- a/readme/README_es.md +++ b/docs/readme/README_es.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

# LMFlow @@ -33,7 +33,7 @@ Una caja de herramientas extensible, conveniente y eficiente para ajustar modelo

-LMFlow-features +LMFlow-features

diff --git a/readme/README_hindi.md b/docs/readme/README_hindi.md similarity index 98% rename from readme/README_hindi.md rename to docs/readme/README_hindi.md index 7a8345baf..63cea6a55 100644 --- a/readme/README_hindi.md +++ b/docs/readme/README_hindi.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

# LMFlow @@ -32,7 +32,7 @@ एक विस्तारयोग्य, सुविधाजनक और दक्ष टूलबॉक्स जो बड़े मशीन लर्निंग मॉडल को finetune करने के लिए बनाया गया है, जो सभी समुदाय के उपयोगकर्ताओं के लिए उपलब्ध होने के साथ-साथ उपयोगकर्ता मित्रता, गति और विश्वसनीयता के साथ डिजाइन किया गया है।

-LMFlow-features +LMFlow-features

diff --git a/readme/README_jp.md b/docs/readme/README_jp.md similarity index 98% rename from readme/README_jp.md rename to docs/readme/README_jp.md index 7fb3d9d85..e23beb0cf 100644 --- a/readme/README_jp.md +++ b/docs/readme/README_jp.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

# LMFlow @@ -33,7 +33,7 @@

-LMFlow-features +LMFlow-features

diff --git a/readme/README_ko.md b/docs/readme/README_ko.md similarity index 98% rename from readme/README_ko.md rename to docs/readme/README_ko.md index 712696522..81da4ba72 100644 --- a/readme/README_ko.md +++ b/docs/readme/README_ko.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

# LMFlow @@ -32,7 +32,7 @@ 다음은 사용자 친화적이고 빠르며 신뢰할 수 있으며 커뮤니티 전체에 액세스할 수 있도록 설계된 대규모 기계 학습 모델을 미세 조정하는 데 유용한 확장 가능하고 편리하며 효율적인 도구 상자입니다.

-LMFlow-features +LMFlow-features

diff --git a/readme/README_zh-hans.md b/docs/readme/README_zh-hans.md similarity index 97% rename from readme/README_zh-hans.md rename to docs/readme/README_zh-hans.md index 90c67211c..56b5bc71e 100644 --- a/readme/README_zh-hans.md +++ b/docs/readme/README_zh-hans.md @@ -1,5 +1,5 @@

-LMFlow +LMFlow

# LMFlow @@ -26,7 +26,7 @@ 一个可扩展、方便和高效的工具箱,用于微调大型机器学习模型。我们的目标是开发一套用户友好、快速可靠,并对整个社区开放的全流程微调代码库。

-LMFlow-features +LMFlow-features

diff --git a/readme/flash_attn2.md b/docs/readme/flash_attn2.md similarity index 100% rename from readme/flash_attn2.md rename to docs/readme/flash_attn2.md diff --git a/readme/multi_node.md b/docs/readme/multi_node.md similarity index 100% rename from readme/multi_node.md rename to docs/readme/multi_node.md diff --git a/docs/source/conf.py b/docs/source/conf.py index afb40d1de..6259349d2 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -6,20 +6,20 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -project = 'LMFlow' -copyright = 'LMFlow 2024' -author = 'The LMFlow Team' -import sys import os -sys.path.insert(0,os.path.abspath('../..')) +import sys + +sys.path.insert(0, os.path.abspath("../..")) # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration +project = "LMFlow" +copyright = "LMFlow 2025" +author = "The LMFlow Team" - -templates_path = ['_templates'] +templates_path = ["_templates"] exclude_patterns = [] extensions = [ @@ -27,29 +27,29 @@ "sphinx.ext.autosummary", "sphinx.ext.todo", "sphinx.ext.viewcode", - 'myst_parser', - 'autoapi.extension', - #"sphinxext.rediraffe", + "myst_parser", + "autoapi.extension", + # "sphinxext.rediraffe", "sphinx_design", - #"sphinx_copybutton", + # "sphinx_copybutton", # For extension examples and demos - #"ablog", + # "ablog", "matplotlib.sphinxext.plot_directive", - #"myst_nb", + # "myst_nb", # "nbsphinx", # Uncomment and comment-out MyST-NB for local testing purposes. "numpydoc", - #"sphinx_togglebutton", - #"sphinx_favicon", + # "sphinx_togglebutton", + # "sphinx_favicon", ] autosummary_generate = True -autoapi_type = 'python' -autoapi_dirs = ['../../src'] +autoapi_type = "python" +autoapi_dirs = ["../../src"] source_suffix = { - '.rst': 'restructuredtext', - '.md': 'markdown', + ".rst": "restructuredtext", + ".md": "markdown", } @@ -57,11 +57,11 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "pydata_sphinx_theme" -html_static_path = ['_static'] +html_static_path = ["_static"] html_css_files = [] # html_logo = "_static/logo.png" html_theme_options = { - "announcement": "We've released our memory-efficient finetuning algorithm LISA, check out [Paper][User Guide] for more details!", + "announcement": "We've released our memory-efficient finetuning algorithm LISA, check out [Paper][User Guide] for more details!", # noqa: E501 "back_to_top_button": False, "header_links_before_dropdown": 4, "icon_links": [ @@ -78,5 +78,4 @@ "image_dark": "_static/logo5.svg", "alt_text": "LMFlow", }, - } - +} diff --git a/docs/source/examples/DATASETS.md b/docs/source/examples/DATASETS.md index 6a3dd5959..220dcc7fc 100644 --- a/docs/source/examples/DATASETS.md +++ b/docs/source/examples/DATASETS.md @@ -129,8 +129,8 @@ Conversational data are commonly used in sft process. We currently support conve Data types: - `conversation_id`: `Optional[Any]`. An identifier for the conversation. `conversation_id` is only for convience of tracking the conversation and will not be used in the pipeline. - `system`: `Optional[string]`. A system prompt that is used to start the conversation. -- `tools`: `Optional[List[string]]`. A list of tools that are used in the conversation. -- `messages`: `List[Dict]`. A list of messages in the conversation. 
Each message contains the following fields: +- `tools`: `Optional[list[string]]`. A list of tools that are used in the conversation. +- `messages`: `list[dict]`. A list of messages in the conversation. Each message contains the following fields: - `role`: `string`. The role of the message. It can be either `user` or `assistant`. - `content`: `string`. The content of the message. diff --git a/docs/source/examples/customize_conversation_template.md b/docs/source/examples/customize_conversation_template.md index a129f0f3f..0b58b3347 100644 --- a/docs/source/examples/customize_conversation_template.md +++ b/docs/source/examples/customize_conversation_template.md @@ -35,8 +35,8 @@ Also, we have a bos token at the beginning of the conversation session. ### 2. Choose proper `Formatter` Recall the requirements for a conversation dataset: > - `system`: `Optional[string]`. -> - `tools`: `Optional[List[string]]`. -> - `messages`: `List[Dict]`. +> - `tools`: `Optional[list[string]]`. +> - `messages`: `list[dict]`. > - `role`: `string`. > - `content`: `string`. diff --git a/examples/benchmarking.py b/examples/benchmarking.py index 2b1d57741..a32d4ea79 100644 --- a/examples/benchmarking.py +++ b/examples/benchmarking.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """A one-line summary of the module or program, terminated by a period. @@ -13,100 +12,106 @@ foo = ClassFoo() bar = foo.FunctionBar() """ + import json import logging -import os +import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import subprocess from transformers import HfArgumentParser +from lmflow.args import AutoArguments, BenchmarkingArguments, DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import ModelArguments, DatasetArguments, AutoArguments, BenchmarkingArguments +from lmflow.pipeline.auto_pipeline import AutoPipeline logger = logging.getLogger(__name__) -LOCAL_DATSET_MAP ={ - "gpt4_en_eval":"data/gp4_instruction_en_eval/", - "gpt4_zh_eval":"data/gpt4_instruction_zh_eval/", - "wiki_zh_eval":"data/wiki_zh_eval/", - "wiki_en_eval":"data/wiki_en_eval/", - "multiturn_dialog_eval":"data/multiturn_dialog_eval/", - "MedMCQA":"data/MedMCQA/validation/", - "MedQA-USMLE":"data/MedQA-USMLE/validation/", - "PubMedQA":"data/PubMedQA/test/", - "alpaca":"data/alpaca/test/", - "common_sense_eval_arc_c":"data/commonsense_eval/arc_c", - "common_sense_eval_arc_e":"data/commonsense_eval/arc_e", - "common_sense_eval_winogrande":"data/commonsense_eval/winogrande/", - "common_sense_eval_obqa":"data/commonsense_eval/obqa/", - "common_sense_eval_piqa":"data/commonsense_eval/piqa/", - "common_sense_eval_hellaswag":"data/commonsense_eval/hellaswag/", - "common_sense_eval_siqa":"data/commonsense_eval/siqa/", - "common_sense_eval_boolq":"data/commonsense_eval/boolq/", - "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp":"data/lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp", - "lmflow_chat_cn_dialog_multiturn_single_nll_text2text":"data/lmflow_chat_cn_dialog_multiturn_single_nll_text2text", - "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp":"data/lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp", - "lmflow_chat_en_dialog_multiturn_single_nll_text2text":"data/lmflow_chat_en_dialog_multiturn_single_nll_text2text", +LOCAL_DATSET_MAP = { + "gpt4_en_eval": 
"data/gp4_instruction_en_eval/", + "gpt4_zh_eval": "data/gpt4_instruction_zh_eval/", + "wiki_zh_eval": "data/wiki_zh_eval/", + "wiki_en_eval": "data/wiki_en_eval/", + "multiturn_dialog_eval": "data/multiturn_dialog_eval/", + "MedMCQA": "data/MedMCQA/validation/", + "MedQA-USMLE": "data/MedQA-USMLE/validation/", + "PubMedQA": "data/PubMedQA/test/", + "alpaca": "data/alpaca/test/", + "common_sense_eval_arc_c": "data/commonsense_eval/arc_c", + "common_sense_eval_arc_e": "data/commonsense_eval/arc_e", + "common_sense_eval_winogrande": "data/commonsense_eval/winogrande/", + "common_sense_eval_obqa": "data/commonsense_eval/obqa/", + "common_sense_eval_piqa": "data/commonsense_eval/piqa/", + "common_sense_eval_hellaswag": "data/commonsense_eval/hellaswag/", + "common_sense_eval_siqa": "data/commonsense_eval/siqa/", + "common_sense_eval_boolq": "data/commonsense_eval/boolq/", + "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp": "data/lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp", # noqa: E501 + "lmflow_chat_cn_dialog_multiturn_single_nll_text2text": "data/lmflow_chat_cn_dialog_multiturn_single_nll_text2text", # noqa: E501 + "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp": "data/lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp", # noqa: E501 + "lmflow_chat_en_dialog_multiturn_single_nll_text2text": "data/lmflow_chat_en_dialog_multiturn_single_nll_text2text", # noqa: E501 } -LM_EVAL_DATASET_MAP={ - "commonsense_qa_eval":"openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq", - "math_eval":"gsm8k", - 'boolq':"boolq", +LM_EVAL_DATASET_MAP = { + "commonsense_qa_eval": "openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq", + "math_eval": "gsm8k", + "boolq": "boolq", } -LOCAL_DATSET_GROUP_MAP={ - "commonsense_nll_eval":"common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\ +LOCAL_DATSET_GROUP_MAP = { + "commonsense_nll_eval": "common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\ common_sense_eval_obqa,common_sense_eval_piqa,common_sense_eval_hellaswag,common_sense_eval_siqa,\ common_sense_eval_boolq", - "gpt4_en_eval":"gpt4_en_eval", - "gpt4_zh_eval":"gpt4_zh_eval", - "wiki_zh_eval":"wiki_zh_eval", - "wiki_en_eval":"wiki_en_eval", - "wiki_eval":"wiki_zh_eval,wiki_en_eval", - "multiturn_dialog_eval":"multiturn_dialog_eval", - "all_nll_eval":"common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\ + "gpt4_en_eval": "gpt4_en_eval", + "gpt4_zh_eval": "gpt4_zh_eval", + "wiki_zh_eval": "wiki_zh_eval", + "wiki_en_eval": "wiki_en_eval", + "wiki_eval": "wiki_zh_eval,wiki_en_eval", + "multiturn_dialog_eval": "multiturn_dialog_eval", + "all_nll_eval": "common_sense_eval_arc_c,common_sense_eval_arc_e,common_sense_eval_winogrande,\ common_sense_eval_obqa,common_sense_eval_piqa,common_sense_eval_hellaswag,common_sense_eval_siqa,\ common_sense_eval_boolq,gpt4_en_eval,gpt4_zh_eval,wiki_zh_eval,wiki_en_eval,\ - lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\ + lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,\ + lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\ lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text", - "lmflow_chat_nll_eval":"lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\ - lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_en_dialog_multiturn_single_nll_text2text", - 
"lmflow_chat_zh_nll_eval":"lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,lmflow_chat_cn_dialog_multiturn_single_nll_text2text", + "lmflow_chat_nll_eval": "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,\ + lmflow_chat_cn_dialog_multiturn_single_nll_text2text,\ + lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp,\ + lmflow_chat_en_dialog_multiturn_single_nll_text2text", + "lmflow_chat_zh_nll_eval": "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp,\ + lmflow_chat_cn_dialog_multiturn_single_nll_text2text", } -LOCAL_DATSET_ANSWERTYPE_MAP={ - "gpt4_en_eval":"text2text", - "gpt4_zh_eval":"text2text", - "wiki_zh_eval":"text_only", - "wiki_en_eval":"text_only", - "multiturn_dialog_eval":"text2text", - "MedMCQA":"multiple_choice", - "MedQA-USMLE":"multiple_choice", - "PubMedQA":"binary_choice", - "alpaca":"text_only", - "common_sense_eval_arc_c":"text_only", - "common_sense_eval_arc_e":"text_only", - "common_sense_eval_winogrande":"text_only", - "common_sense_eval_obqa":"text_only", - "common_sense_eval_piqa":"text_only", - "common_sense_eval_hellaswag":"text_only", - "common_sense_eval_siqa":"text_only", - "common_sense_eval_boolq":"text_only", - "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp":"text2text", - "lmflow_chat_cn_dialog_multiturn_single_nll_text2text":"text2text", - "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp":"text2text", - "lmflow_chat_en_dialog_multiturn_single_nll_text2text":"text2text", +LOCAL_DATSET_ANSWERTYPE_MAP = { + "gpt4_en_eval": "text2text", + "gpt4_zh_eval": "text2text", + "wiki_zh_eval": "text_only", + "wiki_en_eval": "text_only", + "multiturn_dialog_eval": "text2text", + "MedMCQA": "multiple_choice", + "MedQA-USMLE": "multiple_choice", + "PubMedQA": "binary_choice", + "alpaca": "text_only", + "common_sense_eval_arc_c": "text_only", + "common_sense_eval_arc_e": "text_only", + "common_sense_eval_winogrande": "text_only", + "common_sense_eval_obqa": "text_only", + "common_sense_eval_piqa": "text_only", + "common_sense_eval_hellaswag": "text_only", + "common_sense_eval_siqa": "text_only", + "common_sense_eval_boolq": "text_only", + "lmflow_chat_cn_dialog_multiturn_nll_text2text_nosharp": "text2text", + "lmflow_chat_cn_dialog_multiturn_single_nll_text2text": "text2text", + "lmflow_chat_en_dialog_multiturn_nll_text2text_nosharp": "text2text", + "lmflow_chat_en_dialog_multiturn_single_nll_text2text": "text2text", } - + def is_lmflow_local_benchmarking(dataset_name): # local_dataset = ["gpt4_en_eval","gpt4_zh_eval","wiki_zh_eval",\ # "multiturn_dialog_eval","MedMCQA","MedQA-USMLE","PubMedQA",\ @@ -116,49 +121,52 @@ def is_lmflow_local_benchmarking(dataset_name): # "common_sense_eval_siqa","common_sense_eval_boolq"] if dataset_name in LOCAL_DATSET_GROUP_MAP.keys(): dataset_name_collection = LOCAL_DATSET_GROUP_MAP[dataset_name] - for dataset_name_exact in dataset_name_collection.split(','): + for dataset_name_exact in dataset_name_collection.split(","): dataset_name_exact = dataset_name_exact.strip() if "common_sense_eval" in dataset_name_exact: dataset_list = dataset_name_exact.split(",") for common_exact_ in dataset_list: - print("Dealing with "+common_exact_.strip()) + print("Dealing with " + common_exact_.strip()) common_exact_ = common_exact_.strip() if not os.path.exists(LOCAL_DATSET_MAP[common_exact_]): os.system("cd data && ./download.sh common_sense_eval && cd -") else: if not os.path.exists(LOCAL_DATSET_MAP[dataset_name_exact]): - print("Checking if dataset "+ str(dataset_name_exact) + " exists") - os.system("cd data && 
"+'./download.sh '+dataset_name_exact +" && cd -") + print("Checking if dataset " + str(dataset_name_exact) + " exists") + os.system("cd data && " + "./download.sh " + dataset_name_exact + " && cd -") return True else: return False + def is_lm_evaluation_benchmarking(dataset_name): if dataset_name in LM_EVAL_DATASET_MAP.keys(): return True else: return False -def run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args, \ - pipeline_args, model, local_metric="neg_log_likelihood"): + +def run_lmflow_local_benchmarking( + dataset_name, pipeline_name, model_args, pipeline_args, model, local_metric="neg_log_likelihood" +): # Downloads dataset via "data/download.sh" - print('dataset_name.split') - print(dataset_name.split(",")) - result_list = [] + print(f"{dataset_name.split(',')=}") dataset_name = LOCAL_DATSET_GROUP_MAP[dataset_name] dataset_collection = dataset_name.split(",") reuslt_collection = [] for dataset_name_ in dataset_collection: - # Gets mapping from dataset_name to dataset + # Gets mapping from dataset_name to dataset dataset_name_ = dataset_name_.strip() dataset_path = LOCAL_DATSET_MAP[dataset_name_] data_args = DatasetArguments(dataset_path=dataset_path) dataset = Dataset(data_args) - - logger.warning("Default answer type for lmflow local benchmark tasks. \ - Users need to change answer type in LOCAL_DATSET_ANSWERTYPE_MAP for new benchmark tasks.") + + logger.warning( + "Default answer type for lmflow local benchmark tasks. \ + Users need to change answer type in LOCAL_DATSET_ANSWERTYPE_MAP for new benchmark tasks." + ) pipeline_args.answer_type = LOCAL_DATSET_ANSWERTYPE_MAP[dataset_name_] - + evaluator = AutoPipeline.get_pipeline( pipeline_name=pipeline_name, model_args=model_args, @@ -166,31 +174,42 @@ def run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args, \ pipeline_args=pipeline_args, ) # model = model_args.model_name_or_path - # metric should be decided by both dataset_name and pipeline_args + # metric should be decided by both dataset_name and pipeline_args # 1. When --metric is not specified, or "accuracy", log warning and change to # the dataset_name's default metric # 2. 
If specified, use the specified metric - result = evaluator.evaluate(model=model, dataset=dataset, metric=local_metric,verbose=True) - reuslt_collection.append({"dataset":dataset_name_,"result":result}) + result = evaluator.evaluate(model=model, dataset=dataset, metric=local_metric, verbose=True) + reuslt_collection.append({"dataset": dataset_name_, "result": result}) for record in reuslt_collection: - print("-"*30) - print("| Dataset: " + record['dataset'] ) - print("-"*30) + print("-" * 30) + print("| Dataset: " + record["dataset"]) + print("-" * 30) print(f"| current nll: {record['result']}") - print("-"*30) + print("-" * 30) -def run_lm_evaluation_benchmarking(dataset_name,model_name): +def run_lm_evaluation_benchmarking(dataset_name, model_name): # use subprocess maybe - # subprocess.run(["python3", "main.py", "--model", "hf-causal-experimental", + # subprocess.run(["python3", "main.py", "--model", "hf-causal-experimental", # "--model_args" "pretrained=EleutherAI/gpt-j-6b", # "--tasks", "openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq" # "--device", "cuda:0"]) dataset = LM_EVAL_DATASET_MAP[dataset_name] - subprocess.run(["python3", "utils/lm_evaluator.py", "--model", "hf-causal-experimental", - "--model_args", "pretrained="+model_name, - "--tasks", dataset, - "--device", "cuda:0"]) + subprocess.run( + [ + "python3", + "utils/lm_evaluator.py", + "--model", + "hf-causal-experimental", + "--model_args", + "pretrained=" + model_name, + "--tasks", + dataset, + "--device", + "cuda:0", + ] + ) + def main(): # Parses arguments (self-defined for our evaluation platform) @@ -202,28 +221,26 @@ def main(): pipeline_name = "evaluator" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, PipelineArguments, BenchmarkingArguments - )) + parser = HfArgumentParser((ModelArguments, PipelineArguments, BenchmarkingArguments)) model_args, pipeline_args, benchmarking_args = parser.parse_args_into_dataclasses() - with open (pipeline_args.deepspeed, "r") as f: + with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) # Based on dataset name, you need specify the default dataset_path # (if local) or call corresponding lm_evaluation package (via python pack or subprocess) dataset_name = benchmarking_args.dataset_name # metric = pipeline_args.metric - if is_lmflow_local_benchmarking(dataset_name): # TODO (@Jipeng) - model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config) - run_lmflow_local_benchmarking(dataset_name,pipeline_name,model_args,pipeline_args,model) # Pass args TODO (@Jipeng) + if is_lmflow_local_benchmarking(dataset_name): # TODO (@Jipeng) + model = AutoModel.get_model(model_args, do_train=False, ds_config=ds_config) + run_lmflow_local_benchmarking( + dataset_name, pipeline_name, model_args, pipeline_args, model + ) # Pass args TODO (@Jipeng) elif is_lm_evaluation_benchmarking(dataset_name): model = model_args.model_name_or_path run_lm_evaluation_benchmarking(dataset_name, model) # TODO (@Jipeng) else: - raise NotImplementedError( - f"benchmarking dataset {dataset_name} " - " is not supported" - ) + raise NotImplementedError(f"benchmarking dataset {dataset_name} is not supported") + if __name__ == "__main__": main() diff --git a/examples/chatbot.py b/examples/chatbot.py index b6d8138e7..195d6607d 100644 --- a/examples/chatbot.py +++ b/examples/chatbot.py @@ -1,24 +1,23 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at 
HKUST. All rights reserved. -"""A simple shell chatbot implemented with lmflow APIs. -""" -import logging +"""A simple shell chatbot implemented with lmflow APIs.""" + import json +import logging import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import warnings - from dataclasses import dataclass, field -from transformers import HfArgumentParser from typing import Optional +from transformers import HfArgumentParser + +from lmflow.args import AutoArguments, DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import ModelArguments, DatasetArguments, AutoArguments - +from lmflow.pipeline.auto_pipeline import AutoPipeline logging.disable(logging.ERROR) warnings.filterwarnings("ignore") @@ -28,46 +27,40 @@ class ChatbotArguments: prompt_structure: Optional[str] = field( default="{input_text}", - metadata={ - "help": "prompt structure given user's input text" - }, + metadata={"help": "prompt structure given user's input text"}, ) end_string: Optional[str] = field( default="\n\n", - metadata={ - "help": "end string mark of the chatbot's output" - }, + metadata={"help": "end string mark of the chatbot's output"}, ) num_token_per_step: int = field( default=4, - metadata={ - "help": "Number of tokens per step for stream inference" - }, + metadata={"help": "Number of tokens per step for stream inference"}, ) + def main(): pipeline_name = "inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - PipelineArguments, - ChatbotArguments, - )) - model_args, pipeline_args, chatbot_args = ( - parser.parse_args_into_dataclasses() + parser = HfArgumentParser( + ( + ModelArguments, + PipelineArguments, + ChatbotArguments, + ) ) + model_args, pipeline_args, chatbot_args = parser.parse_args_into_dataclasses() inferencer_args = pipeline_args - with open (pipeline_args.deepspeed, "r") as f: + with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( model_args, - tune_strategy='none', + do_train=False, ds_config=ds_config, device=pipeline_args.device, - use_accelerator=True, ) # We don't need input data, we will read interactively from stdin @@ -117,12 +110,9 @@ def main(): input_text = " " context += prompt_structure.format(input_text=input_text) - context = context[-model.get_max_length():] # Memory of the bot + context = context[-model.get_max_length() :] # Memory of the bot - input_dataset = dataset.from_dict({ - "type": "text_only", - "instances": [ { "text": context } ] - }) + input_dataset = dataset.from_dict({"type": "text_only", "instances": [{"text": context}]}) print("Bot: ", end="") print_index = 0 @@ -136,7 +126,7 @@ def main(): token_per_step=token_per_step, temperature=inferencer_args.temperature, end_string=end_string, - input_dataset=input_dataset + input_dataset=input_dataset, ): # Prints characters in the buffer new_print_index = print_index diff --git a/examples/chatbot_gradio.py b/examples/chatbot_gradio.py index d522497a1..f5014861f 100644 --- a/examples/chatbot_gradio.py +++ b/examples/chatbot_gradio.py @@ -1,24 +1,24 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -"""A simple shell chatbot implemented with lmflow APIs. 
-""" -from dataclasses import dataclass, field -import logging +"""A simple shell chatbot implemented with lmflow APIs.""" + import json +import logging import os import sys +from dataclasses import dataclass, field + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) -from typing import Optional import warnings +from typing import Optional import torch from transformers import HfArgumentParser +from lmflow.args import AutoArguments, DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import ModelArguments, DatasetArguments, AutoArguments +from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.utils.versioning import is_gradio_available if is_gradio_available(): @@ -80,40 +80,32 @@ class ChatbotArguments: prompt_structure: Optional[str] = field( default="{input_text}", - metadata={ - "help": "prompt structure given user's input text" - }, + metadata={"help": "prompt structure given user's input text"}, ) end_string: Optional[str] = field( default="\n\n", - metadata={ - "help": "end string mark of the chatbot's output" - }, + metadata={"help": "end string mark of the chatbot's output"}, ) pipeline_name = "inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) -parser = HfArgumentParser(( - ModelArguments, - PipelineArguments, - ChatbotArguments, -)) -model_args, pipeline_args, chatbot_args = ( - parser.parse_args_into_dataclasses() +parser = HfArgumentParser( + ( + ModelArguments, + PipelineArguments, + ChatbotArguments, + ) ) +model_args, pipeline_args, chatbot_args = parser.parse_args_into_dataclasses() inferencer_args = pipeline_args -with open (pipeline_args.deepspeed, "r") as f: +with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( - model_args, - tune_strategy='none', - ds_config=ds_config, - device=pipeline_args.device, - torch_dtype=torch.float16 + model_args, do_train=False, ds_config=ds_config, device=pipeline_args.device, torch_dtype=torch.float16 ) # We don't need input data, we will read interactively from stdin @@ -145,47 +137,50 @@ class ChatbotArguments: token_per_step = 4 + def hist2context(hist): context = "" for query, response in hist: context += prompt_structure.format(input_text=query) - if not (response is None): + if response is not None: context += response return context -def chat_stream(query: str, history= None, **kwargs): + +def chat_stream(query: str, history=None, **kwargs): if history is None: history = [] context = hist2context(history) print_index = 0 context += prompt_structure.format(input_text=query) - context_ = context[-model.get_max_length():] - input_dataset = dataset.from_dict({ - "type": "text_only", - "instances": [ { "text": context_ } ] - }) + context_ = context[-model.get_max_length() :] + input_dataset = dataset.from_dict({"type": "text_only", "instances": [{"text": context_}]}) print(context_) - for response, flag_break in inferencer.stream_inference(context=context_, model=model, max_new_tokens=inferencer_args.max_new_tokens, - token_per_step=token_per_step, temperature=inferencer_args.temperature, - end_string=end_string, input_dataset=input_dataset): + for response, flag_break in inferencer.stream_inference( + context=context_, + model=model, + max_new_tokens=inferencer_args.max_new_tokens, + token_per_step=token_per_step, + temperature=inferencer_args.temperature, + end_string=end_string, + 
input_dataset=input_dataset, + ): delta = response[print_index:] seq = response print_index = len(response) - + yield delta, history + [(query, seq)] if flag_break: break - - -def predict(input, history=None): +def predict(input, history=None): if history is None: history = [] - for response, history in chat_stream(input, history): + for response, history_ in chat_stream(input, history): updates = [] - for query, response in history: + for query, response in history_: updates.append(gr.update(visible=True, value="" + query)) updates.append(gr.update(visible=True, value="" + response)) if len(updates) < MAX_BOXES: @@ -193,9 +188,6 @@ def predict(input, history=None): yield [history] + updates - - - with gr.Blocks(css=css) as demo: gr.HTML(title) state = gr.State([]) @@ -214,6 +206,3 @@ def predict(input, history=None): button.click(predict, [txt, state], [state] + text_boxes) demo.queue().launch(share=True) - - - diff --git a/examples/detail_memory.py b/examples/detail_memory.py index 6458fd514..156b44fbd 100644 --- a/examples/detail_memory.py +++ b/examples/detail_memory.py @@ -1,32 +1,26 @@ -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM -from peft import ( - LoraConfig, - PeftModel, - TaskType, - get_peft_config, - get_peft_model, - prepare_model_for_kbit_training -) -import time import sys +import time + +import torch +from peft import LoraConfig, TaskType, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer LISA = True if sys.argv[3] == "1" else False LORA = True if sys.argv[4] == "1" else False -lora_rank = int(sys.argv[5]) +lora_r = int(sys.argv[5]) # Check if the model name is provided as a command-line argument if len(sys.argv) < 6: print("Usage: python script_name.py ") sys.exit(1) -print("*"*50) +print("*" * 50) print("Script started") print("model : ", sys.argv[1]) print("token_length : ", sys.argv[2]) print("LISA : ", LISA) print("LORA : ", LORA) -print("lora_rank : ", lora_rank) +print("lora_r : ", lora_r) # Model initialization model_name = sys.argv[1] token_length = sys.argv[2] @@ -34,7 +28,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) # Measure memory usage of the weights -model.to('cuda') # Ensure the model is on GPU +model.to("cuda") # Ensure the model is on GPU if LISA: # Only activate two layers for param in model.model.layers.parameters(): @@ -48,10 +42,20 @@ peft_config = LoraConfig( task_type=TaskType.CAUSAL_LM, inference_mode=False, - r=lora_rank, + r=lora_r, lora_alpha=32, lora_dropout=0.1, - target_modules=[ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"], + target_modules=[ + "q_proj", + "k_proj", + "v_proj", + "o_proj", + "gate_proj", + "up_proj", + "down_proj", + "embed_tokens", + "lm_head", + ], ) model = get_peft_model(model, peft_config) model.print_trainable_parameters() @@ -63,16 +67,16 @@ optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) # Execute a forward pass and measure the time -sentence_2048_tokens = 'The OpenAI API is powered by GPT-3 language models which can be coaxed to perform natural language tasks using carefully engineered text prompts. But these models can also generate outputs that are untruthful, toxic, or reflect harmful sentiments. This is in part because GPT-3 is trained to predict the next word on a large dataset of Internet text, rather than to safely perform the language task that the user wants. 
In other words, these models aren’t aligned with their users.\nTo make our models safer, more helpful, and more aligned, we use an existing technique called reinforcement learning from human feedback (RLHF). On prompts submitted by our customers to the API,A[A]We only use prompts submitted through the Playground to an earlier version of the InstructGPT models that was deployed in January 2021. Our human annotators remove personal identifiable information from all prompts before adding it to the training set.our labelers provide demonstrations of the desired model behavior, and rank several outputs from our models. We then use this data to fine-tune GPT-3.The resulting InstructGPT models are much better at following instructions than GPT-3. They also make up facts less often, and show small decreases in toxic output generation. Our labelers prefer outputs from our 1.3B InstructGPT model over outputs from a 175B GPT-3 model, despite having more than 100x fewer parameters. At the same time, we show that we don’t have to compromise on GPT-3’s capabilities, as measured by our model’s performance on academic NLP evaluations.These InstructGPT models, which have been in beta on the API for more than a year, are now the default language models accessible on our API.B[B]The InstructGPT models deployed in the API are updated versions trained using the same human feedback data. They use a similar but slightly different training method that we will describe in a forthcoming publication.We believe that fine-tuning language models with humans in the loop is a powerful tool for improving their safety and reliability, and we will continue to push in this direction.This is the first time our alignment research, which we’ve been pursuing for several years,1,2,3 has been applied to product.Yesterday we announced our next-generation Gemini model: Gemini 1.5. In addition to big improvements to speed and efficiency, one of Gemini 1.5’s innovations is its long context window, which measures how many tokens — the smallest building blocks, like part of a word, image or video — that the model can process at once. To help understand the significance of this milestone, we asked the Google DeepMind project team to explain what long context windows are, and how this breakthrough experimental feature can help developers in many ways. Context windows are important because they help AI models recall information during a session. Have you ever forgotten someone’s name in the middle of a conversation a few minutes after they’ve said it, or sprinted across a room to grab a notebook to jot down a phone number you were just given? Remembering things in the flow of a conversation can be tricky for AI models, too — you might have had an experience where a chatbot “forgot” information after a few turns. That’s where long context windows can help. Previously, Gemini could process up to 32,000 tokens at once, but 1.5 Pro — the first 1.5 model we’re releasing for early testing — has a context window of up to 1 million tokens — the longest context window of any large-scale foundation model to date. In fact, we’ve even successfully tested up to 10 million tokens in our research. And the longer the context window, the more text, images, audio, code or video a model can take in and process. 
"Our original plan was to achieve 128,000 tokens in context, and I thought setting an ambitious bar would be good, so I suggested 1 million tokens," says Google DeepMind Research Scientist Nikolay Savinov, one of the research leads on the long context project. “And now we’ve even surpassed that in our research by 10x.” To make this kind of leap forward, the team had to make a series of deep learning innovations. Early explorations by Pranav Shyam offered valuable insights that helped steer our subsequent research in the right direction. “There was one breakthrough that led to another and another, and each one of them opened up new possibilities,” explains Google DeepMind Engineer Denis Teplyashin. “And then, when they all stacked together, we were quite surprised to discover what they could do, jumping from 128,000 tokens to 512,000 tokens to 1 million tokens, and just recently, 10 million tokens in our internal research.” The raw data that 1.5 Pro can handle opens up whole new ways to interact with the model. Instead of summarizing a document dozens of pages long, for example, it can summarize documents thousands of pages long. Where the old model could help analyze thousands of lines of code, thanks to its breakthrough long context window, 1.5 Pro can analyze tens of thousands of lines of code at once. “In one test, we dropped in an entire code base and it wrote documentation for it, which was really cool,” says Google DeepMind Research Scientist Machel Reid. “And there was another test where it was able to accurately answer questions about the 1924 film Sherlock Jr. after we gave the model the entire 45-minute movie to ‘watch.’” 1.5 Pro can also reason across data provided in a prompt. “One of my favorite examples from the past few days is this rare language — Kalamang — that fewer than 200 people worldwide speak, and there one grammar manual about it,” says Machel. The model can speak it on its own if you just ask it to translate into this language, but with the expanded long context window, you can put the entire grammar manual and some examples of sentences into context, and the model was able to learn to translate from English to Kalamang at a similar level to a person learning from the same content.”Gemini 1.5 Pro comes standard with a 128K-token context window, but a limited group of developers and enterprise customers can try it with a context window of up to 1 million tokens via AI Studio and Vertex AI in private preview. The full 1 million token context window is computationally intensive and still requires further optimizations to improve latency, which we’re actively working on as we scale it out. And as the team looks to the future, they’re continuing to work to make the model faster and more efficient, with safety at the core. They’re also looking to further expand the long context window, improve the underlying architectures, and integrate new hardware improvements. “10 million tokens at once is already close to the thermal limit of our Tensor Processing Units — we dont know where the limit is yet, and the model might be capable of even more as the hardware continues to improve,” says Nikolay.The team is excited to see what kinds of experiences developers and the broader community are able to achieve, too. “When I first saw we had a million tokens in context, my first question was, ‘What do you even use this for?’” says Machel. 
“But now, I think people’s imaginations are expanding, and they’ll find more and more creative ways to use these new capabilities.Long-context question answering (QA) tasks require reasoning over a long document or multiple documents. Addressing these tasks often benefits from identifying a set of evidence spans (e.g., sentences), which provide supporting evidence for answering the question. In this work, we propose a novel method for equipping long-context QA models with an additional sequence-level objective for better identification of the supporting evidence. We achieve this via an additional contrastive supervision signal in finetuning, where the model is encouraged to explicitly discriminate supporting evidence sentences from negative ones by maximizing question-evidence similarity. The proposed additional loss exhibits consistent improvements on three different strong long-context transformer models, across two challenging question answering benchmarks – HotpotQA and QAsper.Actively and judiciously select the most helpful questions for LLMs“Existing CoT studies largely rely on a fixed set of human-annotated exemplars, which are not necessarily the most effective ones. A good performance requires human prompt engineering which is costly. We identify the human prompt engineering as two complementary components: question selection and prompt template engineering. In this paper, we offer a solution to the key problem of determining which questions are the most important and helpful ones to annotate from a pool of task-specific queries. By borrowing ideas from the related problem of uncertainty-based active learning, we introduce several metrics to characterize the uncertainty so as to select the most uncertain questions for annotation. Experimental results demonstrate the superiority of our proposed method on eight complex reasoning tasks. With text-davinci-002, active-prompt improves upon by 7 (67.9->74.9).' -inputs = tokenizer(sentence_2048_tokens, return_tensors='pt', max_length=int(token_length),truncation=True).to('cuda') -labels = inputs['input_ids'] +sentence_2048_tokens = 'The OpenAI API is powered by GPT-3 language models which can be coaxed to perform natural language tasks using carefully engineered text prompts. But these models can also generate outputs that are untruthful, toxic, or reflect harmful sentiments. This is in part because GPT-3 is trained to predict the next word on a large dataset of Internet text, rather than to safely perform the language task that the user wants. In other words, these models aren’t aligned with their users.\nTo make our models safer, more helpful, and more aligned, we use an existing technique called reinforcement learning from human feedback (RLHF). On prompts submitted by our customers to the API,A[A]We only use prompts submitted through the Playground to an earlier version of the InstructGPT models that was deployed in January 2021. Our human annotators remove personal identifiable information from all prompts before adding it to the training set.our labelers provide demonstrations of the desired model behavior, and rank several outputs from our models. We then use this data to fine-tune GPT-3.The resulting InstructGPT models are much better at following instructions than GPT-3. They also make up facts less often, and show small decreases in toxic output generation. Our labelers prefer outputs from our 1.3B InstructGPT model over outputs from a 175B GPT-3 model, despite having more than 100x fewer parameters. 
At the same time, we show that we don’t have to compromise on GPT-3’s capabilities, as measured by our model’s performance on academic NLP evaluations.These InstructGPT models, which have been in beta on the API for more than a year, are now the default language models accessible on our API.B[B]The InstructGPT models deployed in the API are updated versions trained using the same human feedback data. They use a similar but slightly different training method that we will describe in a forthcoming publication.We believe that fine-tuning language models with humans in the loop is a powerful tool for improving their safety and reliability, and we will continue to push in this direction.This is the first time our alignment research, which we’ve been pursuing for several years,1,2,3 has been applied to product.Yesterday we announced our next-generation Gemini model: Gemini 1.5. In addition to big improvements to speed and efficiency, one of Gemini 1.5’s innovations is its long context window, which measures how many tokens — the smallest building blocks, like part of a word, image or video — that the model can process at once. To help understand the significance of this milestone, we asked the Google DeepMind project team to explain what long context windows are, and how this breakthrough experimental feature can help developers in many ways. Context windows are important because they help AI models recall information during a session. Have you ever forgotten someone’s name in the middle of a conversation a few minutes after they’ve said it, or sprinted across a room to grab a notebook to jot down a phone number you were just given? Remembering things in the flow of a conversation can be tricky for AI models, too — you might have had an experience where a chatbot “forgot” information after a few turns. That’s where long context windows can help. Previously, Gemini could process up to 32,000 tokens at once, but 1.5 Pro — the first 1.5 model we’re releasing for early testing — has a context window of up to 1 million tokens — the longest context window of any large-scale foundation model to date. In fact, we’ve even successfully tested up to 10 million tokens in our research. And the longer the context window, the more text, images, audio, code or video a model can take in and process. "Our original plan was to achieve 128,000 tokens in context, and I thought setting an ambitious bar would be good, so I suggested 1 million tokens," says Google DeepMind Research Scientist Nikolay Savinov, one of the research leads on the long context project. “And now we’ve even surpassed that in our research by 10x.” To make this kind of leap forward, the team had to make a series of deep learning innovations. Early explorations by Pranav Shyam offered valuable insights that helped steer our subsequent research in the right direction. “There was one breakthrough that led to another and another, and each one of them opened up new possibilities,” explains Google DeepMind Engineer Denis Teplyashin. “And then, when they all stacked together, we were quite surprised to discover what they could do, jumping from 128,000 tokens to 512,000 tokens to 1 million tokens, and just recently, 10 million tokens in our internal research.” The raw data that 1.5 Pro can handle opens up whole new ways to interact with the model. Instead of summarizing a document dozens of pages long, for example, it can summarize documents thousands of pages long. 
Where the old model could help analyze thousands of lines of code, thanks to its breakthrough long context window, 1.5 Pro can analyze tens of thousands of lines of code at once. “In one test, we dropped in an entire code base and it wrote documentation for it, which was really cool,” says Google DeepMind Research Scientist Machel Reid. “And there was another test where it was able to accurately answer questions about the 1924 film Sherlock Jr. after we gave the model the entire 45-minute movie to ‘watch.’” 1.5 Pro can also reason across data provided in a prompt. “One of my favorite examples from the past few days is this rare language — Kalamang — that fewer than 200 people worldwide speak, and there one grammar manual about it,” says Machel. The model can speak it on its own if you just ask it to translate into this language, but with the expanded long context window, you can put the entire grammar manual and some examples of sentences into context, and the model was able to learn to translate from English to Kalamang at a similar level to a person learning from the same content.”Gemini 1.5 Pro comes standard with a 128K-token context window, but a limited group of developers and enterprise customers can try it with a context window of up to 1 million tokens via AI Studio and Vertex AI in private preview. The full 1 million token context window is computationally intensive and still requires further optimizations to improve latency, which we’re actively working on as we scale it out. And as the team looks to the future, they’re continuing to work to make the model faster and more efficient, with safety at the core. They’re also looking to further expand the long context window, improve the underlying architectures, and integrate new hardware improvements. “10 million tokens at once is already close to the thermal limit of our Tensor Processing Units — we dont know where the limit is yet, and the model might be capable of even more as the hardware continues to improve,” says Nikolay.The team is excited to see what kinds of experiences developers and the broader community are able to achieve, too. “When I first saw we had a million tokens in context, my first question was, ‘What do you even use this for?’” says Machel. “But now, I think people’s imaginations are expanding, and they’ll find more and more creative ways to use these new capabilities.Long-context question answering (QA) tasks require reasoning over a long document or multiple documents. Addressing these tasks often benefits from identifying a set of evidence spans (e.g., sentences), which provide supporting evidence for answering the question. In this work, we propose a novel method for equipping long-context QA models with an additional sequence-level objective for better identification of the supporting evidence. We achieve this via an additional contrastive supervision signal in finetuning, where the model is encouraged to explicitly discriminate supporting evidence sentences from negative ones by maximizing question-evidence similarity. The proposed additional loss exhibits consistent improvements on three different strong long-context transformer models, across two challenging question answering benchmarks – HotpotQA and QAsper.Actively and judiciously select the most helpful questions for LLMs“Existing CoT studies largely rely on a fixed set of human-annotated exemplars, which are not necessarily the most effective ones. A good performance requires human prompt engineering which is costly. 
We identify the human prompt engineering as two complementary components: question selection and prompt template engineering. In this paper, we offer a solution to the key problem of determining which questions are the most important and helpful ones to annotate from a pool of task-specific queries. By borrowing ideas from the related problem of uncertainty-based active learning, we introduce several metrics to characterize the uncertainty so as to select the most uncertain questions for annotation. Experimental results demonstrate the superiority of our proposed method on eight complex reasoning tasks. With text-davinci-002, active-prompt improves upon by 7 (67.9->74.9).' # noqa: E501 +inputs = tokenizer(sentence_2048_tokens, return_tensors="pt", max_length=int(token_length), truncation=True).to("cuda") +labels = inputs["input_ids"] # store the gpu memory for inputs input_memory = torch.cuda.memory_allocated() - weight_memory torch.cuda.reset_peak_memory_stats() # Reset peak memory stats to measure forward pass accurately start = time.time() outputs = model(**inputs, labels=labels) -activation_memory = torch.cuda.memory_allocated() - weight_memory +activation_memory = torch.cuda.memory_allocated() - weight_memory forward_time = time.time() - start # Execute a backward pass and measure the time @@ -83,7 +87,7 @@ gradient_memory = torch.cuda.memory_allocated() - weight_memory # activation memory will free automatically optimizer.step() -optimizer_memory = torch.cuda.memory_allocated() - gradient_memory - weight_memory +optimizer_memory = torch.cuda.memory_allocated() - gradient_memory - weight_memory backward_time = time.time() - start @@ -98,4 +102,4 @@ print(f"Total memory : {total_memory / 1e6} MB\n") print(f"Forward time : {forward_time} s") print(f"Backward time : {backward_time} s") -print("*"*50) +print("*" * 50) diff --git a/examples/dpo_train.py b/examples/dpo_train.py index 7932c8c11..3ec3bd341 100644 --- a/examples/dpo_train.py +++ b/examples/dpo_train.py @@ -1,26 +1,21 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # @Time : 7/4/2024 20:31 # @Author : Yu Li -# @Site : +# @Site : # @File : dpo_train.py # 0. 
imports -import logging import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) -from dataclasses import dataclass, field -from typing import Optional -import torch -from transformers import HfArgumentParser, pipeline, AutoTokenizer +from transformers import HfArgumentParser from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) -from lmflow.datasets.dataset import Dataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline @@ -28,11 +23,13 @@ # Parses arguments pipeline_name = "dpo_aligner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments, - )) + parser = HfArgumentParser( + ( + ModelArguments, + DatasetArguments, + PipelineArguments, + ) + ) model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() @@ -46,8 +43,4 @@ model = AutoModel.get_model(model_args) # Aligns model with rewards - aligned_model = aligner.align( - model=model, - dataset=None, - reward_model=None - ) \ No newline at end of file + aligned_model = aligner.align(model=model, dataset=None, reward_model=None) diff --git a/examples/dpov2_train.py b/examples/dpov2_train.py index ae4ca22d6..cf30682cc 100644 --- a/examples/dpov2_train.py +++ b/examples/dpov2_train.py @@ -1,33 +1,27 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +import copy import logging import os import sys -import copy -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.models.auto_model import AutoModel -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) -from lmflow.utils.common import remove_dataclass_attr_prefix, create_copied_dataclass - +from lmflow.datasets import Dataset +from lmflow.models.auto_model import AutoModel +from lmflow.pipeline.auto_pipeline import AutoPipeline +from lmflow.utils.common import create_copied_dataclass, remove_dataclass_attr_prefix logger = logging.getLogger(__name__) ReferenceModelArguments = create_copied_dataclass( - original_dataclass=ModelArguments, - field_prefix="reference_", - class_prefix="Reference" + original_dataclass=ModelArguments, field_prefix="reference_", class_prefix="Reference" ) @@ -36,19 +30,16 @@ def main(): pipeline_name = "dpov2_aligner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - ReferenceModelArguments, - DatasetArguments, - PipelineArguments - )) + parser = HfArgumentParser((ModelArguments, ReferenceModelArguments, DatasetArguments, PipelineArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, ref_model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + model_args, ref_model_args, data_args, pipeline_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) else: model_args, ref_model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() - + ref_model_args_dict = remove_dataclass_attr_prefix(ref_model_args, "reference_") ref_model_args = ModelArguments(**ref_model_args_dict) @@ -66,13 +57,13 @@ def main(): ref_model_args=ref_model_args, ) - res = aligner.align( + aligner.align( model=model, ref_model=ref_model, train_dataset=train_dataset, eval_dataset=eval_dataset, ) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/ds_config.json b/examples/ds_config.json deleted file mode 100644 index 37e4f0dcb..000000000 --- a/examples/ds_config.json +++ /dev/null @@ -1,11 +0,0 @@ -{ - "fp16": { - "enabled": false - }, - "bf16": { - "enabled": true - }, - "steps_per_print": 2000, - "train_micro_batch_size_per_gpu": 1, - "wall_clock_breakdown": false -} diff --git a/examples/evaluation.py b/examples/evaluation.py index b9360763c..f436049bf 100644 --- a/examples/evaluation.py +++ b/examples/evaluation.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """A one-line summary of the module or program, terminated by a period. @@ -13,17 +12,18 @@ foo = ClassFoo() bar = foo.FunctionBar() """ + import json import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from transformers import HfArgumentParser +from lmflow.args import AutoArguments, DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import ModelArguments, DatasetArguments, AutoArguments - +from lmflow.pipeline.auto_pipeline import AutoPipeline pipeline_name = "evaluator" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) @@ -31,14 +31,13 @@ parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() -with open (pipeline_args.deepspeed, "r") as f: +with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( - model_args, - tune_strategy='none', - ds_config=ds_config, - use_accelerator=pipeline_args.use_accelerator_for_evaluator + model_args, + do_train=False, + ds_config=ds_config, ) dataset = Dataset(data_args) diff --git a/examples/finetune.py b/examples/finetune.py index 6ec69f70b..1a77d95e6 100644 --- a/examples/finetune.py +++ b/examples/finetune.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """A one-line summary of the module or program, terminated by a period. 
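[Editor's note, outside the diff itself] A pattern that recurs in the example scripts above and below: HfArgumentParser populates the same dataclasses either from command-line flags or from a single JSON file passed as the script's only argument. The dataclasses below are toy stand-ins, not LMFlow's real ModelArguments/DatasetArguments; this is only a minimal sketch of the pattern.

from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class ToyModelArguments:
    model_name_or_path: str = field(default="gpt2")

@dataclass
class ToyDataArguments:
    dataset_path: str = field(default="data/toy_dataset")

parser = HfArgumentParser((ToyModelArguments, ToyDataArguments))

# Flag style, e.g. `python script.py --model_name_or_path facebook/opt-125m`:
model_args, data_args = parser.parse_args_into_dataclasses(
    args=["--model_name_or_path", "facebook/opt-125m"]
)

# Single-JSON style, as in the `sys.argv[1].endswith(".json")` branches above:
# model_args, data_args = parser.parse_json_file(json_file="configs/finetune.json")

print(model_args.model_name_or_path, data_args.dataset_path)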
@@ -14,24 +13,24 @@ bar = foo.FunctionBar() """ -import sys import os +import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from transformers import HfArgumentParser from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) - from lmflow.datasets.dataset import Dataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline def main(): - # Parses arguments + # Parses arguments pipeline_name = "finetuner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) @@ -54,8 +53,8 @@ def main(): model = AutoModel.get_model(model_args) # Finetuning - tuned_model = finetuner.tune(model=model, dataset=dataset) + finetuner.tune(model=model, dataset=dataset) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/finetune_multi_modal.py b/examples/finetune_multi_modal.py index 90fecc2a3..01401096a 100644 --- a/examples/finetune_multi_modal.py +++ b/examples/finetune_multi_modal.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. # FIXME should merge with finetune.py """A one-line summary of the module or program, terminated by a period. @@ -15,26 +14,22 @@ bar = foo.FunctionBar() """ -import sys import os +import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from transformers import HfArgumentParser from lmflow.args import ( - VisModelArguments, - MultiModalDatasetArguments, AutoArguments, + MultiModalDatasetArguments, + VisModelArguments, ) - from lmflow.datasets.dataset import Dataset +from lmflow.datasets.multi_modal_dataset import DataCollatorForSupervisedDataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline -from lmflow.models.vision2seq_model import CustomAutoVision2SeqModel -from lmflow.models.vision_encoder import build_vision_tower -from lmflow.datasets.multi_modal_dataset import DataCollatorForSupervisedDataset -from torch.utils.data import DataLoader - def main(): # Parses arguments @@ -59,11 +54,14 @@ def main(): # do not resiger deepspeed in the model. # with_deepspeed flag may be removed # by modifying the tune strategy in the future. - model = AutoModel.get_model(model_args, tune_strategy='none', - ds_config=pipeline_args.deepspeed, - custom_model=True, - with_deepspeed=False, - pipeline_args=pipeline_args) + model = AutoModel.get_model( + model_args, + do_train=True, + ds_config=pipeline_args.deepspeed, + custom_model=True, + with_deepspeed=False, + pipeline_args=pipeline_args, + ) # FIXME check if need to move this part to hf_encoder_decoder.py for param in model.backend_model.parameters(): param.requires_grad = False @@ -81,9 +79,8 @@ def main(): data_collator = DataCollatorForSupervisedDataset(tokenizer=model.tokenizer) # Finetuning - tuned_model = finetuner.tune( - model=model, dataset=dataset, data_collator=data_collator) + finetuner.tune(model=model, dataset=dataset, data_collator=data_collator) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/inference.py b/examples/inference.py index 6381aa86d..f87c1ee17 100644 --- a/examples/inference.py +++ b/examples/inference.py @@ -1,48 +1,47 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -"""A simple shell chatbot implemented with lmflow APIs. 
-""" -import logging +"""A simple shell chatbot implemented with lmflow APIs.""" + import json +import logging import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import warnings -from dataclasses import dataclass, field from transformers import HfArgumentParser -from typing import Optional +from lmflow.args import AutoArguments, DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import ModelArguments, DatasetArguments, AutoArguments - +from lmflow.pipeline.auto_pipeline import AutoPipeline logging.disable(logging.ERROR) warnings.filterwarnings("ignore") + def main(): pipeline_name = "inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - PipelineArguments, - )) + parser = HfArgumentParser( + ( + ModelArguments, + PipelineArguments, + ) + ) model_args, pipeline_args = parser.parse_args_into_dataclasses() inferencer_args = pipeline_args - with open (pipeline_args.deepspeed, "r") as f: + with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( model_args, - tune_strategy='none', + do_train=False, ds_config=ds_config, device=pipeline_args.device, - use_accelerator=True, ) # We don't need input data, we will read interactively from stdin @@ -63,12 +62,9 @@ def main(): while True: input_text = input("User >>> ") - input_text = input_text[-model.get_max_length():] # Truncation + input_text = input_text[-model.get_max_length() :] # Truncation - input_dataset = dataset.from_dict({ - "type": "text_only", - "instances": [ { "text": input_text } ] - }) + input_dataset = dataset.from_dict({"type": "text_only", "instances": [{"text": input_text}]}) output_dataset = inferencer.inference( model=model, diff --git a/examples/iterative_dpo_train.py b/examples/iterative_dpo_train.py index 14ffde329..df517c082 100644 --- a/examples/iterative_dpo_train.py +++ b/examples/iterative_dpo_train.py @@ -1,42 +1,34 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +import copy import logging import os import sys -import copy -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) -from lmflow.utils.common import remove_dataclass_attr_prefix, create_copied_dataclass - +from lmflow.datasets import Dataset +from lmflow.pipeline.auto_pipeline import AutoPipeline +from lmflow.utils.common import create_copied_dataclass, remove_dataclass_attr_prefix logger = logging.getLogger(__name__) # NOTE: # In training processes that needs more than one model such as dpo (reference & target), -# ppo (actor & critic), etc., we use the following function to create separate model arguments +# ppo (actor & critic), etc., we use the following function to create separate model arguments # to distinguish among them. 
ReferenceModelArguments = create_copied_dataclass( - original_dataclass=ModelArguments, - field_prefix="reference_", - class_prefix="Reference" + original_dataclass=ModelArguments, field_prefix="reference_", class_prefix="Reference" ) RewardModelArguments = create_copied_dataclass( - original_dataclass=ModelArguments, - field_prefix="reward_", - class_prefix="Reward" + original_dataclass=ModelArguments, field_prefix="reward_", class_prefix="Reward" ) @@ -44,22 +36,22 @@ def main(): pipeline_name = "iterative_dpo_aligner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - ReferenceModelArguments, - RewardModelArguments, - DatasetArguments, - PipelineArguments - )) + parser = HfArgumentParser( + (ModelArguments, ReferenceModelArguments, RewardModelArguments, DatasetArguments, PipelineArguments) + ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. - model_args, ref_model_args, reward_model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) + model_args, ref_model_args, reward_model_args, data_args, pipeline_args = parser.parse_json_file( + json_file=os.path.abspath(sys.argv[1]) + ) elif len(sys.argv) == 2 and sys.argv[1].endswith(".yaml"): - model_args, ref_model_args, reward_model_args, data_args, pipeline_args = parser.parse_yaml_file(yaml_file=os.path.abspath(sys.argv[1])) + model_args, ref_model_args, reward_model_args, data_args, pipeline_args = parser.parse_yaml_file( + yaml_file=os.path.abspath(sys.argv[1]) + ) else: model_args, ref_model_args, reward_model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() - + ref_model_args_dict = remove_dataclass_attr_prefix(ref_model_args, "reference_") ref_model_args = ModelArguments(**ref_model_args_dict) reward_model_args_dict = remove_dataclass_attr_prefix(reward_model_args, "reward_") @@ -70,7 +62,7 @@ def main(): iter_data_args = copy.deepcopy(data_args) iter_data_args.dataset_path = dataset dataset_list.append(Dataset(iter_data_args)) - + aligner = AutoPipeline.get_pipeline( pipeline_name=pipeline_name, model_args=model_args, @@ -81,7 +73,7 @@ def main(): ) aligner.align(dataset_list=dataset_list) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/merge_lora.py b/examples/merge_lora.py index cd52363a8..55367af10 100644 --- a/examples/merge_lora.py +++ b/examples/merge_lora.py @@ -1,45 +1,36 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """ Merge base model and lora model into a full model. 
""" -import sys import os +import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from dataclasses import dataclass, field -from transformers import HfArgumentParser from typing import Optional +from transformers import HfArgumentParser + from lmflow.args import ( ModelArguments, - AutoArguments, ) - from lmflow.models.auto_model import AutoModel @dataclass class MergeLoraArguments: device: str = field( - default='cpu', + default="cpu", metadata={ "help": "device to merge model on", }, ) - ds_config: str = field( - default='configs/ds_config_eval.json', - metadata={ - "help": "deepspeed config file path", - }, - ) output_model_path: Optional[str] = field( default=None, - metadata={ - "help": "output merged full model path" - }, + metadata={"help": "output merged full model path"}, ) local_rank: Optional[int] = field( default=-1, @@ -55,21 +46,20 @@ def main(): model_args, merge_lora_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, merge_lora_args = parser.parse_args_into_dataclasses() - - if merge_lora_args.device == 'gpu': - raise NotImplementedError('Merging LoRA weight using GPU not supported yet. Please use cpu.') + + if merge_lora_args.device == "gpu": + raise NotImplementedError("Merging LoRA weight using GPU not supported yet. Please use cpu.") model_args.use_lora = True model = AutoModel.get_model( - model_args, - tune_strategy='none', + model_args, + do_train=False, device=merge_lora_args.device, - ds_config=merge_lora_args.ds_config ) model.activate_model_for_inference() model.merge_lora_weights() model.save(merge_lora_args.output_model_path, save_full_model=True) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/multistage_finetune.py b/examples/multistage_finetune.py index 2a090331a..de616a0f6 100644 --- a/examples/multistage_finetune.py +++ b/examples/multistage_finetune.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """ The finetuning of each epoch will be splitted into `--num_stages_per_epoch` @@ -10,11 +9,12 @@ be totally 4*3 stages, and the learning rate schedule will restart every stage. 
""" +import gc import logging +import os import random import sys -import os -import gc + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from dataclasses import dataclass, field @@ -22,35 +22,24 @@ from transformers import HfArgumentParser from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) - from lmflow.datasets.dataset import Dataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline - logger = logging.getLogger(__name__) @dataclass class MultistageFinetuneArgs: - num_stages_per_epoch: int = field( - default=1, - metadata={"help": "number of stages per epoch"} - ) + num_stages_per_epoch: int = field(default=1, metadata={"help": "number of stages per epoch"}) shuffle_base_seed: int = field( - default=23, - metadata={ - "help": "base seed for generating dataset shuffle seeds each epoch" - } - ) - start_epoch: int = field( - default=0, - metadata={"help": "start from a specific epoch"} + default=23, metadata={"help": "base seed for generating dataset shuffle seeds each epoch"} ) + start_epoch: int = field(default=0, metadata={"help": "start from a specific epoch"}) def setup_logger(): @@ -59,7 +48,7 @@ def setup_logger(): handler = logging.StreamHandler(sys.stdout) handler.setLevel(logging.INFO) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") handler.setFormatter(formatter) root.addHandler(handler) @@ -76,10 +65,7 @@ def shuffle_and_split_data(dataset, num_split=None, seed=None): dataset_list = [] for i in range(num_split): - partial_data_dict = { - "type": data_dict["type"], - "instances": data_dict["instances"][i::num_split] - } + partial_data_dict = {"type": data_dict["type"], "instances": data_dict["instances"][i::num_split]} partial_dataset = Dataset.create_from_dict(partial_data_dict) dataset_list.append(partial_dataset) @@ -90,22 +76,22 @@ def main(): # Initializes logger setup_logger() - # Parses arguments + # Parses arguments pipeline_name = "finetuner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments, - MultistageFinetuneArgs, - )) + parser = HfArgumentParser( + ( + ModelArguments, + DatasetArguments, + PipelineArguments, + MultistageFinetuneArgs, + ) + ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a # json file, let's parse it to get our arguments. 
- model_args, data_args, pipeline_args = parser.parse_json_file( - json_file=os.path.abspath(sys.argv[1]) - ) + model_args, data_args, pipeline_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, pipeline_args, multistage_args = parser.parse_args_into_dataclasses() @@ -113,10 +99,7 @@ def main(): full_dataset = Dataset(data_args) finetuner_args = pipeline_args - logger.warning( - "force set `--overwrite_output_dir True`" - " as required by multistage finetuning" - ) + logger.warning("force set `--overwrite_output_dir True` as required by multistage finetuning") finetuner_args.overwrite_output_dir = True # Finetuning @@ -126,7 +109,7 @@ def main(): output_dir = finetuner_args.output_dir run_name = finetuner_args.run_name - finetuner_args.num_train_epochs = 1 # Finetune every 1 epoch + finetuner_args.num_train_epochs = 1 # Finetune every 1 epoch shuffle_seed = multistage_args.shuffle_base_seed for epoch in range(int(num_train_epochs)): @@ -141,16 +124,14 @@ def main(): continue for stage, dataset in enumerate(dataset_list): - is_main_process = (finetuner_args.local_process_index == 0) + is_main_process = finetuner_args.local_process_index == 0 if is_main_process: logger.setLevel(logging.INFO) logger.info(f"========== epoch {epoch} stage {stage} ==========") # Initialization for each sub-finetune model = AutoModel.get_model(model_args) - finetuner_args.output_dir = ( - output_dir + f"_epoch-{epoch}_stage-{stage}" - ) + finetuner_args.output_dir = output_dir + f"_epoch-{epoch}_stage-{stage}" finetuner_args.run_name = run_name + f"_epoch-{epoch}_stage-{stage}" finetuner = AutoPipeline.get_pipeline( @@ -167,17 +148,15 @@ def main(): transform_dataset_in_place=False, ) model_args.model_name_or_path = finetuner_args.output_dir + tuned_model.save(output_dir) del model del tuned_model del finetuner - tuned_model = None - model = None - finetuner = None + tuned_model = None # noqa: F841 + model = None # noqa: F841 + finetuner = None # noqa: F841 gc.collect() - tuned_model.save(output_dir) - - -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/raft_align.py b/examples/raft_align.py index 759d01dba..a5a8904bf 100644 --- a/examples/raft_align.py +++ b/examples/raft_align.py @@ -1,23 +1,21 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. """Alignment tuning example, such as RLHF.""" -import logging import os import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) from dataclasses import dataclass, field from typing import Optional -from transformers import HfArgumentParser, pipeline, AutoTokenizer +from transformers import AutoTokenizer, HfArgumentParser, pipeline from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) - from lmflow.datasets.dataset import Dataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline @@ -30,30 +28,24 @@ class RewardArguments: metadata={ "help": ( "type of reward model, support huggingface pipeline. Will" - " support \"customized\" torch.nn.modules in the future." + ' support "customized" torch.nn.modules in the future.' 
), }, ) reward_model_or_path: Optional[str] = field( default="weqweasdas/hh_rlhf_rm", metadata={ - "help": ( - "reward model name (huggingface) or its path" - ), + "help": ("reward model name (huggingface) or its path"), }, ) reward_task: Optional[str] = field( default="sentiment-analysis", - metadata={ - "help": "type of reward task, such as sentiment-analysis, detoxic." - }, + metadata={"help": "type of reward task, such as sentiment-analysis, detoxic."}, ) reward_model_args: Optional[str] = field( - default="return_all_scores=True, function_to_apply=\"none\", batch_size=1", + default='return_all_scores=True, function_to_apply="none", batch_size=1', metadata={ - "help": ( - "extra arguments required by different type of reward models." - ), + "help": ("extra arguments required by different type of reward models."), }, ) @@ -63,62 +55,53 @@ def get_reward_function(reward_args, pipeline_args): reward_type = args.reward_type if reward_type == "hf_pipeline": - # GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token. # only for this model. rm_tokenizer = AutoTokenizer.from_pretrained(reward_args.reward_model_or_path) rm_tokenizer.pad_token = rm_tokenizer.eos_token rm_tokenizer.pad_token_id = rm_tokenizer.eos_token_id rm_tokenizer.padding_side = "left" - + hf_pipe = pipeline( reward_args.reward_task, model=reward_args.reward_model_or_path, device=f"cuda:{pipeline_args.local_rank}", - tokenizer=rm_tokenizer + tokenizer=rm_tokenizer, ) + def reward_func(dataset: Dataset): if dataset.type != "text_only": - raise NotImplementedError( - "reward function only accept \"text_only\" datasets" - ) - pipe_kwargs = { - "return_all_scores": True, - "function_to_apply": "none", - "batch_size": 1 - } + raise NotImplementedError('reward function only accept "text_only" datasets') + pipe_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 1} data_dict = dataset.to_dict() - texts_for_rewards = [ - sample["text"] for sample in data_dict["instances"] - ] + texts_for_rewards = [sample["text"] for sample in data_dict["instances"]] pipe_outputs = hf_pipe(texts_for_rewards, **pipe_kwargs) rewards = [output[0]["score"] for output in pipe_outputs] - reward_dataset = Dataset.create_from_dict({ - "type": "float_only", - "instances": [ - { "value": reward } for reward in rewards - ] - }) + reward_dataset = Dataset.create_from_dict( + {"type": "float_only", "instances": [{"value": reward} for reward in rewards]} + ) return reward_dataset return reward_func else: - raise NotImplementedError("unsupported reward type \"{reward_type}\"") + raise NotImplementedError('unsupported reward type "{reward_type}"') def main(): - # Parses arguments + # Parses arguments pipeline_name = "raft_aligner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments, - RewardArguments, - )) + parser = HfArgumentParser( + ( + ModelArguments, + DatasetArguments, + PipelineArguments, + RewardArguments, + ) + ) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): model_args, data_args, pipeline_args, reward_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1]) @@ -144,12 +127,12 @@ def main(): reward_model.register_inference_function(reward_function) # Aligns model with rewards - aligned_model = aligner.align( + aligner.align( model=model, dataset=dataset, reward_model=reward_model, ) -if __name__ == '__main__': - main() \ No newline at end of file +if 
__name__ == "__main__": + main() diff --git a/examples/reward_modeling.py b/examples/reward_modeling.py index e69fc9b7e..66e6c949f 100644 --- a/examples/reward_modeling.py +++ b/examples/reward_modeling.py @@ -1,31 +1,27 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -import sys import os +import sys + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) -import torch -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) from lmflow.datasets.dataset import Dataset from lmflow.models.auto_model import AutoModel from lmflow.pipeline.auto_pipeline import AutoPipeline - logger = logging.getLogger(__name__) def main(): - # Parses arguments + # Parses arguments pipeline_name = "rm_tuner" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) @@ -46,10 +42,10 @@ def main(): ) dataset = Dataset(data_args) model = AutoModel.get_model(model_args) - + # Finetuning - tuned_model = finetuner.tune(model=model, dataset=dataset) + finetuner.tune(model=model, dataset=dataset) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/examples/rm_inference.py b/examples/rm_inference.py index 0f8acf7ce..7de1e727e 100644 --- a/examples/rm_inference.py +++ b/examples/rm_inference.py @@ -1,23 +1,19 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging import os import sys -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.models.auto_model import AutoModel -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) - +from lmflow.datasets import Dataset +from lmflow.models.auto_model import AutoModel +from lmflow.pipeline.auto_pipeline import AutoPipeline logger = logging.getLogger(__name__) @@ -27,11 +23,7 @@ def main(): pipeline_name = "rm_inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments - )) + parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
@@ -40,22 +32,19 @@ def main(): model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() dataset = Dataset(data_args) - model = AutoModel.get_model(model_args, tune_strategy='none', use_accelerator=pipeline_args.use_accelerator) + model = AutoModel.get_model(model_args, do_train=False) inferencer = AutoPipeline.get_pipeline( - pipeline_name=pipeline_name, - model_args=model_args, - data_args=data_args, - pipeline_args=pipeline_args + pipeline_name=pipeline_name, model_args=model_args, data_args=data_args, pipeline_args=pipeline_args ) res = inferencer.inference( model, dataset, ) - + if pipeline_args.save_results: res.save(pipeline_args.results_path) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/speculative_inference.py b/examples/speculative_inference.py index 200507ec9..43fc73734 100644 --- a/examples/speculative_inference.py +++ b/examples/speculative_inference.py @@ -1,52 +1,64 @@ -import os import argparse +import os -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--gpu', type=str, default='0', - help='gpu id, currently speculative inference only support single gpu') - parser.add_argument('--model', type=str, default='gpt2-xl', - help='target model name or path (i.e., the large model you want to accelerate), \ - currently only supports huggingface decoder only models') - parser.add_argument('--draft_model', type=str, default='gpt2', - help='draft model name or path, currently only supports huggingface decoder only models') - parser.add_argument('--gamma', type=int, default=5, - help='number of tokens that the draft model will generate at each step') - parser.add_argument('--max_new_tokens', type=int, default=512, - help='maximum number of tokens that the speculative inference will generate') - parser.add_argument('--temperature', type=float, default=0.3, - help='temperature for sampling') - + parser.add_argument( + "--gpu", type=str, default="0", help="gpu id, currently speculative inference only support single gpu" + ) + parser.add_argument( + "--model", + type=str, + default="gpt2-xl", + help="target model name or path (i.e., the large model you want to accelerate), \ + currently only supports huggingface decoder only models", + ) + parser.add_argument( + "--draft_model", + type=str, + default="gpt2", + help="draft model name or path, currently only supports huggingface decoder only models", + ) + parser.add_argument( + "--gamma", type=int, default=5, help="number of tokens that the draft model will generate at each step" + ) + parser.add_argument( + "--max_new_tokens", + type=int, + default=512, + help="maximum number of tokens that the speculative inference will generate", + ) + parser.add_argument("--temperature", type=float, default=0.3, help="temperature for sampling") + params = parser.parse_args() - + os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu - from lmflow.args import InferencerArguments - from lmflow.args import ModelArguments - from lmflow.args import DatasetArguments + from lmflow.args import DatasetArguments, InferencerArguments, ModelArguments from lmflow.models import hf_decoder_model from lmflow.pipeline.inferencer import SpeculativeInferencer - - + model_args = ModelArguments(model_name_or_path=params.model) model = hf_decoder_model.HFDecoderModel(model_args) draft_model_args = ModelArguments(model_name_or_path=params.draft_model) draft_model = hf_decoder_model.HFDecoderModel(draft_model_args) inferencer_args = 
InferencerArguments() data_args = DatasetArguments() - + specinf = SpeculativeInferencer(model_args, draft_model_args, data_args, inferencer_args) - + while True: try: text = input("Speculative Inference: ") - specinf_res = specinf.inference(model, - draft_model, - text, - gamma=params.gamma, - max_new_tokens=params.max_new_tokens, - temperature=params.temperature) + specinf_res = specinf.inference( + model, + draft_model, + text, + gamma=params.gamma, + max_new_tokens=params.max_new_tokens, + temperature=params.temperature, + ) print(specinf_res) - print('\n\n') + print("\n\n") except EOFError: break diff --git a/examples/tool_inference.py b/examples/tool_inference.py index 34b08af5c..f1b3dde0e 100644 --- a/examples/tool_inference.py +++ b/examples/tool_inference.py @@ -1,45 +1,51 @@ -import os import argparse -from lmflow.args import InferencerArguments -from lmflow.args import ModelArguments -from lmflow.args import DatasetArguments +import os + +from lmflow.args import DatasetArguments, InferencerArguments, ModelArguments from lmflow.models import hf_decoder_model from lmflow.pipeline.inferencer import ToolInferencer + + def main(): parser = argparse.ArgumentParser() - parser.add_argument('--gpu', type=str, default='0', - help='gpu id, currently speculative inference only support single gpu') - parser.add_argument('--model', type=str, default='codellama/CodeLlama-7b-instruct-hf', - help='target code generation model name or path you \ - currently only supports huggingface decoder only models') + parser.add_argument( + "--gpu", type=str, default="0", help="gpu id, currently speculative inference only support single gpu" + ) + parser.add_argument( + "--model", + type=str, + default="codellama/CodeLlama-7b-instruct-hf", + help="target code generation model name or path you \ + currently only supports huggingface decoder only models", + ) params = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = params.gpu - + model_args = ModelArguments(model_name_or_path=params.model) model = hf_decoder_model.HFDecoderModel(model_args) inferencer_args = InferencerArguments() data_args = DatasetArguments() - + toolinf = ToolInferencer(model_args, data_args, inferencer_args) - + while True: try: text = input("Tool Inference: ") toolinf_res = toolinf.inference(model, text) - toolinf_res = toolinf_res.replace("","") - toolinf_res = toolinf_res.replace("","") - print('\n\nResult:') + toolinf_res = toolinf_res.replace("", "") + toolinf_res = toolinf_res.replace("", "") + print("\n\nResult:") print(toolinf_res) - print('\n\n') + print("\n\n") run_code = input("Run code? (y/n): ") - if run_code == 'y': + if run_code == "y": toolinf.code_exec(toolinf_res) - if run_code == 'n': + if run_code == "n": continue - except EOFError: break -if __name__ == '__main__': - main() \ No newline at end of file + +if __name__ == "__main__": + main() diff --git a/examples/vis_chatbot.py b/examples/vis_chatbot.py index d313ef9f9..3f32e260a 100644 --- a/examples/vis_chatbot.py +++ b/examples/vis_chatbot.py @@ -1,58 +1,46 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -"""A simple shell to inference the input data. 
-""" -from cmath import e -from dataclasses import dataclass, field -import logging +"""A simple shell to inference the input data.""" + import json -import numpy as np -import requests -from PIL import Image +import logging import os import os.path as osp import sys +from dataclasses import dataclass, field + +import numpy as np +import requests +from PIL import Image + sys.path.remove(os.path.abspath(os.path.dirname(sys.argv[0]))) import time -from typing import Optional import warnings +from typing import Optional from transformers import HfArgumentParser +from lmflow.args import AutoArguments, DatasetArguments, VisModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import (VisModelArguments, DatasetArguments, \ - InferencerArguments, AutoArguments) +from lmflow.pipeline.auto_pipeline import AutoPipeline logging.disable(logging.ERROR) warnings.filterwarnings("ignore") + @dataclass class ChatbotArguments: prompt_structure: Optional[str] = field( default="{input_text}", - metadata={ - "help": "prompt structure given user's input text" - }, + metadata={"help": "prompt structure given user's input text"}, ) end_string: Optional[str] = field( default="#", - metadata={ - "help": "end string mark of the chatbot's output" - }, - ) - image_path: Optional[str] = field( - default=None, - metadata={ - "help": "image path for input image"} - ) - input_text: Optional[str] = field( - default="", - metadata={ - "help": "input text for reasoning"} + metadata={"help": "end string mark of the chatbot's output"}, ) + image_path: Optional[str] = field(default=None, metadata={"help": "image path for input image"}) + input_text: Optional[str] = field(default="", metadata={"help": "input text for reasoning"}) task: Optional[str] = field( default="vqa", metadata={ @@ -62,50 +50,35 @@ class ChatbotArguments: "the image and the conversation is only one round" "If other, the conversation is multi-round" ) - } + }, ) chatbot_type: Optional[str] = field( default="None", - metadata={ - "help": ( - "prompt format" - "the default format is ''" - "Anthoer format is they way in mini-gpt4." 
- ) - } - ) - stream_inference: Optional[bool] = field( - default=False, - metadata={ - "help": "whether to do the stream inference" - } - ) - with_deepspeed: Optional[bool] = field( - default=True, - metadata={ - "help": "whether to use deepspeed" - } + metadata={"help": ("prompt formatthe default format is ''Anthoer format is they way in mini-gpt4.")}, ) + stream_inference: Optional[bool] = field(default=False, metadata={"help": "whether to do the stream inference"}) + with_deepspeed: Optional[bool] = field(default=True, metadata={"help": "whether to use deepspeed"}) + def main(): pipeline_name = "inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - VisModelArguments, - PipelineArguments, - ChatbotArguments, - )) - - model_args, pipeline_args, chatbot_args = ( - parser.parse_args_into_dataclasses() + parser = HfArgumentParser( + ( + VisModelArguments, + PipelineArguments, + ChatbotArguments, + ) ) + + model_args, pipeline_args, chatbot_args = parser.parse_args_into_dataclasses() inferencer_args = pipeline_args - with open (pipeline_args.deepspeed, "r") as f: + with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( model_args, - tune_strategy='none', + do_train=False, ds_config=ds_config, device=pipeline_args.device, custom_model=model_args.custom_model, @@ -125,9 +98,7 @@ def main(): # Chats model_name = model_args.model_name_or_path if model_args.llm_model_name_or_path is not None: - model_name = model_name + " with {}".format( - model_args.llm_model_name_or_path - ) + model_name = model_name + " with {}".format(model_args.llm_model_name_or_path) if model_args.lora_model_path is not None: model_name += f" + {model_args.lora_model_path}" @@ -146,15 +117,20 @@ def main(): # " unconditionally." # ) - end_string = chatbot_args.end_string if chatbot_args.chatbot_type == "mini_gpt": - context = "Give the following image: ImageContent. " + "You will be able to see the image once I provide it to you. Please answer my questions." + context = ( + "Give the following image: ImageContent. " + + "You will be able to see the image once I provide it to you. Please answer my questions." + ) user_name = "Human" sep = "###" elif chatbot_args.chatbot_type == "llava": - context = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + context = ( + "A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions." 
+ ) user_name = "USER" sep = " " else: @@ -168,8 +144,8 @@ def main(): if chatbot_args.image_path is not None: raw_image = Image.open(chatbot_args.image_path) else: - img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' - raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') + img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" + raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") base_size = raw_image.size image_list.append(np.array(raw_image)) input_text = chatbot_args.input_text @@ -187,20 +163,25 @@ def main(): if model_args.use_prompt_cache: if osp.exists(model_args.prompt_cache_path): - model.backend_model.load_prompt_cache( - model_args.prompt_cache_path) + model.backend_model.load_prompt_cache(model_args.prompt_cache_path) if chatbot_args.task == "image_caption": # single round reasoning - input_dataset = dataset.from_dict({ - "type": "image_text", - "instances": [{"images": np.stack(image_list), - "text": input_text,}] - }) + input_dataset = dataset.from_dict( + { + "type": "image_text", + "instances": [ + { + "images": np.stack(image_list), + "text": input_text, + } + ], + } + ) output = inferencer.inference(model, input_dataset) - print(output.backend_dataset['text']) + print(output.backend_dataset["text"]) else: - # text, 1st image token, answer, text, 2nd image token, + # text, 1st image token, answer, text, 2nd image token, while True: input_text = input("User >>> ") if input_text == "exit": @@ -212,7 +193,7 @@ def main(): raw_image = Image.open(image_path) # current dataset doesn't support batch of image with different shape # so we resize the image and convert then into a numpy array - # In the future, we need to design a new dataset format that support + # In the future, we need to design a new dataset format that support # batch of image with different shape raw_image = raw_image.resize(base_size) image_list.append(np.array(raw_image)) @@ -232,23 +213,29 @@ def main(): context = "" print("Chat history cleared") continue - + if text_after_loading_image is False: context += sep + user_name + ": " else: text_after_loading_image = False - + if not input_text: input_text = " " context += prompt_structure.format(input_text=input_text) # TODO handle when model doesn't have the get_max_length - context = context[-model.get_max_length():] # Memory of the bot - input_dataset = dataset.from_dict({ - "type": "image_text", - "instances": [{"images": np.stack(image_list), - "text": context,}] - }) + context = context[-model.get_max_length() :] # Memory of the bot + input_dataset = dataset.from_dict( + { + "type": "image_text", + "instances": [ + { + "images": np.stack(image_list), + "text": context, + } + ], + } + ) if chatbot_args.chatbot_type in ["mini_gpt", "llava"]: remove_image_flag = True else: @@ -260,8 +247,9 @@ def main(): model, input_dataset, remove_image_flag=remove_image_flag, - chatbot_type=chatbot_args.chatbot_type,) - response = output_dataset.backend_dataset['text'] + chatbot_type=chatbot_args.chatbot_type, + ) + response = output_dataset.backend_dataset["text"] print(response[0]) print("\n", end="") context += response[0] @@ -279,7 +267,7 @@ def main(): token_per_step=token_per_step, temperature=inferencer_args.temperature, end_string=end_string, - input_dataset=input_dataset + input_dataset=input_dataset, ): # Prints characters in the buffer new_print_index = print_index @@ -298,13 +286,13 @@ def main(): print("\n", end="") context += response + "\n" - 
+ if model_args.use_prompt_cache: - osp.makedirs(osp.dirname(model_args.prompt_cache_path), - exist_ok=True) - model.backend_model.save_prompt_cache( - model_args.prompt_cache_path) + osp.makedirs(osp.dirname(model_args.prompt_cache_path), exist_ok=True) + model.backend_model.save_prompt_cache(model_args.prompt_cache_path) end_time = time.time() print("Whole data and model forward time", end_time - begin_time) + + if __name__ == "__main__": main() diff --git a/examples/vis_chatbot_gradio.py b/examples/vis_chatbot_gradio.py index e86a29c6a..19ef779a5 100644 --- a/examples/vis_chatbot_gradio.py +++ b/examples/vis_chatbot_gradio.py @@ -1,38 +1,36 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -"""A simple Multimodal chatbot implemented with lmflow APIs. -""" -from dataclasses import dataclass, field +"""A simple Multimodal chatbot implemented with lmflow APIs.""" + import json import logging import time import warnings +from dataclasses import dataclass, field from typing import Optional import numpy as np -from PIL import Image import torch +from PIL import Image from transformers import HfArgumentParser +from lmflow.args import AutoArguments, DatasetArguments, VisModelArguments from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.models.auto_model import AutoModel -from lmflow.args import (VisModelArguments, DatasetArguments, \ - InferencerArguments, AutoArguments) +from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.utils.versioning import is_gradio_available if is_gradio_available(): import gradio as gr else: raise ImportError("Gradio is not available. Please install it via `pip install gradio`.") - + MAX_BOXES = 20 logging.disable(logging.ERROR) warnings.filterwarnings("ignore") -torch.multiprocessing.set_start_method('spawn', force=True) +torch.multiprocessing.set_start_method("spawn", force=True) css = """ #user { @@ -71,43 +69,26 @@ class ChatbotArguments: prompt_structure: Optional[str] = field( default="{input_text}", - metadata={ - "help": "prompt structure given user's input text" - }, + metadata={"help": "prompt structure given user's input text"}, ) end_string: Optional[str] = field( default="#", - metadata={ - "help": "end string mark of the chatbot's output" - }, - ) - image_path: Optional[str] = field( - default=None, - metadata={ - "help": "image path for input image"} - ) - input_text: Optional[str] = field( - default="", - metadata={ - "help": "input text for reasoning"} + metadata={"help": "end string mark of the chatbot's output"}, ) + image_path: Optional[str] = field(default=None, metadata={"help": "image path for input image"}) + input_text: Optional[str] = field(default="", metadata={"help": "input text for reasoning"}) task: Optional[str] = field( default="image_caption", metadata={ "help": "task for reasoning", - } - ) - chatbot_format: Optional[str] = field( - default="None", - metadata={ - "help": "prompt format" - } + }, ) + chatbot_format: Optional[str] = field(default="None", metadata={"help": "prompt format"}) def gradio_reset(chat_state, img_list): if chat_state is not None: - chat_state = '' + chat_state = "" if img_list is not None: img_list = [] return ( @@ -118,6 +99,7 @@ def gradio_reset(chat_state, img_list): img_list, ) + def upload_image(image_file, history, text_input, chat_state, image_list): # if gr_image is None: # return None, None, gr.update(interactive=True), chat_state, None @@ 
-125,12 +107,14 @@ def upload_image(image_file, history, text_input, chat_state, image_list): if chat_state is None: if chatbot_args.chatbot_format == "mini_gpt": - chat_state = "Give the following image: ImageContent. " + "You will be able to see the image once I provide it to you. Please answer my questions." + chat_state = ( + "Give the following image: ImageContent. " + + "You will be able to see the image once I provide it to you. Please answer my questions." + ) else: - chat_state = '' + chat_state = "" image = read_img(image_file.name) - if not isinstance(image_list, list) or ( - isinstance(image_list, list) and len(image_list) == 0): + if not isinstance(image_list, list) or (isinstance(image_list, list) and len(image_list) == 0): image_list = [] image_list.append(image) else: @@ -139,45 +123,48 @@ def upload_image(image_file, history, text_input, chat_state, image_list): if chatbot_args.chatbot_format == "mini_gpt": chat_state += "### Human: " + "" return ( - gr.update(interactive=True, placeholder='Enter text and press enter, or upload an image'), + gr.update(interactive=True, placeholder="Enter text and press enter, or upload an image"), history, chat_state, image_list, ) + def read_img(image): if isinstance(image, str): - raw_image = Image.open(image).convert('RGB') + raw_image = Image.open(image).convert("RGB") elif isinstance(image, Image.Image): raw_image = image else: raise NotImplementedError return raw_image + def gradio_ask(user_message, chatbot, chat_state): if len(user_message) == 0: - return gr.update(interactive=True, placeholder='Input should not be empty!'), chatbot, chat_state + return gr.update(interactive=True, placeholder="Input should not be empty!"), chatbot, chat_state prompted_user_message = prompt_structure.format(input_text=user_message) if chat_state is None: - chat_state = '' + chat_state = "" chat_state = chat_state + prompted_user_message chatbot = chatbot + [[user_message, None]] - return '', chatbot, chat_state + return "", chatbot, chat_state def gradio_answer(chatbot, chat_state, image_list, num_beams=1, temperature=1.0): - input_dataset = dataset.from_dict({ - "type": "image_text", - "instances": [{"images": np.stack([np.array(i) for i in image_list]), - "text": chat_state}] - }) - remove_image_flag = chatbot_args.chatbot_format=="mini_gpt" + input_dataset = dataset.from_dict( + { + "type": "image_text", + "instances": [{"images": np.stack([np.array(i) for i in image_list]), "text": chat_state}], + } + ) + remove_image_flag = chatbot_args.chatbot_format == "mini_gpt" - chatbot[-1][1] = '' + chatbot[-1][1] = "" print_index = 0 - token_per_step = 4 # 48 + token_per_step = 4 # 48 max_new_tokens = -1 temperature = 0.7 context = chatbot @@ -192,15 +179,9 @@ def gradio_answer(chatbot, chat_state, image_list, num_beams=1, temperature=1.0) while not response_queue.empty(): response_queue.get() - request_queue.put(( - context, - max_new_tokens, - token_per_step, - temperature, - end_string, - input_dataset, - remove_image_flag - )) + request_queue.put( + (context, max_new_tokens, token_per_step, temperature, end_string, input_dataset, remove_image_flag) + ) while True: if not response_queue.empty(): @@ -240,12 +221,12 @@ def start_inferencer( dataset, chatbot_args, ): - with open(pipeline_args.deepspeed, "r") as f: + with open(pipeline_args.deepspeed) as f: ds_config = json.load(f) model = AutoModel.get_model( model_args, - tune_strategy='none', + do_train=False, ds_config=ds_config, device=pipeline_args.device, custom_model=model_args.custom_model, @@ 
-288,13 +269,13 @@ def start_inferencer( break if not break_in_the_middle: - response_text = '' + response_text = "" flag_break = True response_queue.put((response_text, flag_break)) mark = "" while mark != "busy": - mark = request_queue.get() # Release the "busy" mark + mark = request_queue.get() # Release the "busy" mark time.sleep(0.001) @@ -303,14 +284,14 @@ def start_inferencer( pipeline_name = "inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - VisModelArguments, - PipelineArguments, - ChatbotArguments, - )) - model_args, pipeline_args, chatbot_args = ( - parser.parse_args_into_dataclasses() + parser = HfArgumentParser( + ( + VisModelArguments, + PipelineArguments, + ChatbotArguments, + ) ) + model_args, pipeline_args, chatbot_args = parser.parse_args_into_dataclasses() data_args = DatasetArguments(dataset_path=None) dataset = Dataset(data_args, backend="dict") @@ -372,9 +353,7 @@ def start_inferencer( inputs=[chatbot, chat_state, image_list], outputs=[chatbot, chat_state, image_list], ) - txt_msg.then( - lambda: gr.update(interactive=True), None, [text_input], queue=False - ) + txt_msg.then(lambda: gr.update(interactive=True), None, [text_input], queue=False) file_msg = upload_button.upload( fn=upload_image, @@ -392,4 +371,3 @@ def start_inferencer( demo.queue(max_size=1, api_open=False).launch(share=True) inferencer_process.join() - diff --git a/examples/vllm_inference.py b/examples/vllm_inference.py index d6e4b7859..950505db8 100644 --- a/examples/vllm_inference.py +++ b/examples/vllm_inference.py @@ -1,23 +1,19 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging import os import sys -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.models.auto_model import AutoModel -from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) - +from lmflow.datasets import Dataset +from lmflow.models.auto_model import AutoModel +from lmflow.pipeline.auto_pipeline import AutoPipeline logger = logging.getLogger(__name__) @@ -27,11 +23,7 @@ def main(): pipeline_name = "vllm_inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments - )) + parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
@@ -40,22 +32,19 @@ def main(): model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() dataset = Dataset(data_args) - model = AutoModel.get_model(model_args, tune_strategy='none') + model = AutoModel.get_model(model_args, do_train=False) inferencer = AutoPipeline.get_pipeline( - pipeline_name=pipeline_name, - model_args=model_args, - data_args=data_args, - pipeline_args=pipeline_args + pipeline_name=pipeline_name, model_args=model_args, data_args=data_args, pipeline_args=pipeline_args ) - res = inferencer.inference( + inferencer.inference( model, dataset, release_gpu=False, enable_decode_inference_result=pipeline_args.enable_decode_inference_result, enable_distributed_vllm_inference=pipeline_args.enable_distributed_vllm_inference, ) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/experimental/Hymba/README.md b/experimental/Hymba/README.md index 3d11018c8..74366d1fb 100644 --- a/experimental/Hymba/README.md +++ b/experimental/Hymba/README.md @@ -40,4 +40,4 @@ Recommend on the A100, H100, A40 GPUs. ## Training Loss The training loss curve for `nvidia/Hymba-1.5B-Instruct`, fine-tuned on the `MedMCQA/train` dataset with a learning rate of $5e-5$ over 100 steps using SFT, LoRA, LISA, and DORA, is shown below: -![Training Loss](../../assets/Hymba_loss.png) \ No newline at end of file +![Training Loss](../../docs/assets/Hymba_loss.png) \ No newline at end of file diff --git a/experimental/Hymba/run_finetune_hymba.sh b/experimental/Hymba/run_finetune_hymba.sh index 37b6b05b5..4f141b452 100644 --- a/experimental/Hymba/run_finetune_hymba.sh +++ b/experimental/Hymba/run_finetune_hymba.sh @@ -66,7 +66,7 @@ deepspeed ${deepspeed_args} \ --block_size 256 \ --trust_remote_code True \ --per_device_train_batch_size 1 \ - --deepspeed configs/ds_config_zero2_no_offload.json \ + --deepspeed configs/archive/ds_config_zero2_no_offload.json \ --bf16 \ --run_name hymba_finetune \ --validation_split_percentage 0 \ diff --git a/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo.py b/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo.py index 4bb6b8944..32cea65ec 100644 --- a/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo.py +++ b/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 bram-w, The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,6 +21,7 @@ import shutil from pathlib import Path +import diffusers import numpy as np import torch import torch.nn.functional as F @@ -32,16 +32,6 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset -from huggingface_hub import create_repo, upload_folder -from packaging import version -from peft import LoraConfig -from peft.utils import get_peft_model_state_dict -from PIL import Image -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -53,7 +43,14 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, convert_state_dict_to_diffusers from diffusers.utils.import_utils import is_xformers_available - +from huggingface_hub import create_repo, upload_folder +from packaging import version +from peft import LoraConfig +from peft.utils import get_peft_model_state_dict +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig # Will error if the minimal version of diffusers is not installed. Remove at your own risks. check_min_version("0.25.0.dev0") @@ -62,7 +59,7 @@ VALIDATION_PROMPTS = [ - "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", + "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", # noqa: E501 "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k", "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece", @@ -86,7 +83,7 @@ def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: st def log_validation(args, unet, accelerator, weight_dtype, epoch, is_final_validation=False): - logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.") + logger.info(f"Running validation... \n Generating images with prompts:\n {VALIDATION_PROMPTS}.") # create pipeline pipeline = DiffusionPipeline.from_pretrained( @@ -192,7 +189,9 @@ def parse_args(input_args=None): "--run_validation", default=False, action="store_true", - help="Whether to run validation inference in between training and also after training. Helps to track progress.", + help=( + "Whether to run validation inference in between training and also after training. Helps to track progress." + ), ) parser.add_argument( "--validation_steps", @@ -205,8 +204,7 @@ def parse_args(input_args=None): type=int, default=None, help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." + "For debugging purposes or quicker training, truncate the number of training examples to this value if set." 
), ) parser.add_argument( @@ -448,9 +446,7 @@ def tokenize_captions(tokenizer, examples): for caption in examples["caption"]: captions.append(caption) - text_inputs = tokenizer( - captions, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt" - ) + text_inputs = tokenizer(captions, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt") return text_inputs.input_ids @@ -540,8 +536,8 @@ def main(args): text_encoder.requires_grad_(False) unet.requires_grad_(False) - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision - # as these weights are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) + # to half-precision as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 @@ -575,7 +571,9 @@ def main(args): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during " + "training, please update xFormers to at least 0.0.17. " + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() else: @@ -640,7 +638,7 @@ def load_model_hook(models, input_dir): except ImportError: raise ImportError( "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
- ) + ) from None optimizer_class = bnb.optim.AdamW8bit else: @@ -904,13 +902,15 @@ def collate_fn(examples): checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + # before we save the new checkpoint, we need to have at _most_ + # `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + f"{len(checkpoints)} checkpoints already exist, removing " + f"{len(removing_checkpoints)} checkpoints" ) logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") @@ -923,9 +923,7 @@ def collate_fn(examples): logger.info(f"Saved state to {save_path}") if args.run_validation and global_step % args.validation_steps == 0: - log_validation( - args, unet=unet, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch - ) + log_validation(args, unet=unet, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch) logs = { "loss": loss.detach().item(), diff --git a/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo_lisa.py b/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo_lisa.py index e166f536a..cff91ea41 100644 --- a/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo_lisa.py +++ b/experimental/LISA-diffusion/diffusion_dpo/train_diffusion_dpo_lisa.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 bram-w, The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -22,27 +21,18 @@ import shutil from pathlib import Path +import accelerate +import diffusers import numpy as np import torch import torch.nn.functional as F import torch.utils.checkpoint import transformers import wandb -import accelerate from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset -from huggingface_hub import create_repo, upload_folder -from packaging import version -from peft import LoraConfig -from peft.utils import get_peft_model_state_dict -from PIL import Image -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -50,11 +40,15 @@ DPMSolverMultistepScheduler, UNet2DConditionModel, ) -from diffusers.loaders import LoraLoaderMixin from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version, convert_state_dict_to_diffusers +from diffusers.utils import check_min_version from diffusers.utils.import_utils import is_xformers_available - +from huggingface_hub import create_repo, upload_folder +from packaging import version +from PIL import Image +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
check_min_version("0.25.0.dev0") @@ -63,31 +57,34 @@ VALIDATION_PROMPTS = [ - "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", + "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", # noqa: E501 "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k", "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece", ] + def freeze_all_layers(model): for param in model.parameters(): param.requires_grad = False -def random_activate_layers(model,p): - activate_number = int((len(list(model.parameters()))-2) * p) - index = np.random.choice(range(0,len(list(model.parameters()))-1,1), activate_number, replace=False) + +def random_activate_layers(model, p): + activate_number = int((len(list(model.parameters())) - 2) * p) + index = np.random.choice(range(0, len(list(model.parameters())) - 1, 1), activate_number, replace=False) count = 0 for param in model.parameters(): - if count == 0 or count == len(list(model.parameters()))-1: + if count == 0 or count == len(list(model.parameters())) - 1: param.requires_grad = True elif count in index: param.requires_grad = True - + count += 1 -def lisa(model,p=0.25): + +def lisa(model, p=0.25): freeze_all_layers(model) - random_activate_layers(model,p) + random_activate_layers(model, p) def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): @@ -120,8 +117,9 @@ def update_ema(target_params, source_params, rate=0.99): # if src.requires_grad == True: targ.detach().mul_(rate).add_(src, alpha=1 - rate) + def log_validation(args, unet, accelerator, weight_dtype, epoch, is_final_validation=False): - logger.info(f"Running validation... \n Generating images with prompts:\n" f" {VALIDATION_PROMPTS}.") + logger.info(f"Running validation... \n Generating images with prompts:\n {VALIDATION_PROMPTS}.") # create pipeline pipeline = DiffusionPipeline.from_pretrained( @@ -226,7 +224,9 @@ def parse_args(input_args=None): "--run_validation", default=False, action="store_true", - help="Whether to run validation inference in between training and also after training. Helps to track progress.", + help=( + "Whether to run validation inference in between training and also after training. Helps to track progress." + ), ) parser.add_argument( "--validation_steps", @@ -239,8 +239,7 @@ def parse_args(input_args=None): type=int, default=None, help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." + "For debugging purposes or quicker training, truncate the number of training examples to this value if set." 
), ) parser.add_argument( @@ -476,9 +475,7 @@ def tokenize_captions(tokenizer, examples): for caption in examples["caption"]: captions.append(caption) - text_inputs = tokenizer( - captions, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt" - ) + text_inputs = tokenizer(captions, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt") return text_inputs.input_ids @@ -505,6 +502,7 @@ def main(args): accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) from accelerate import DistributedDataParallelKwargs + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -597,7 +595,9 @@ def main(args): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during " + "training, please update xFormers to at least 0.0.17. " + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() else: @@ -629,8 +629,8 @@ def load_model_hook(models, input_dir): del load_model accelerator.register_save_state_pre_hook(save_model_hook) - accelerator.register_load_state_pre_hook(load_model_hook) # Enable TF32 for faster training on Ampere GPUs, - + accelerator.register_load_state_pre_hook(load_model_hook) # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices if args.allow_tf32: torch.backends.cuda.matmul.allow_tf32 = True @@ -647,7 +647,7 @@ def load_model_hook(models, input_dir): except ImportError: raise ImportError( "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
- ) + ) from None optimizer_class = bnb.optim.AdamW8bit else: @@ -657,16 +657,18 @@ def load_model_hook(models, input_dir): scheduler_dict = dict() param_number = len(list(unet.parameters())) print(param_number) - lisa_p = 0.25 # 128 / param_number # 32 / param_number - lisa(model=unet,p=lisa_p) + lisa_p = 0.25 # 128 / param_number # 32 / param_number + lisa(model=unet, p=lisa_p) for p in unet.parameters(): if p.requires_grad: - optimizer_dict[p] = optimizer_class([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_class( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) # Dataset and DataLoaders creation: @@ -737,12 +739,13 @@ def collate_fn(examples): for p in unet.parameters(): if p.requires_grad: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) - + # define a hook function to update the parameter p during the backward pass def optimizer_hook(p): if p.grad is None: @@ -751,18 +754,21 @@ def optimizer_hook(p): return else: if p not in optimizer_dict: - optimizer_dict[p] = optimizer_class([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_class( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) if p not in scheduler_dict: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) if accelerator.sync_gradients: torch.nn.utils.clip_grad_norm_(p, args.max_grad_norm) @@ -844,7 +850,7 @@ def optimizer_hook(p): for step, batch in enumerate(train_dataloader): if total_count % 6 == 0 and total_count != 0: param_number = len(list(unet.parameters())) - lisa(model=unet,p=lisa_p) + lisa(model=unet, p=lisa_p) total_count += 1 with accelerator.accumulate(unet): # (batch_size, 2*channels, h, w) -> (2*batch_size, channels, h, w) @@ -902,11 +908,15 @@ def optimizer_hook(p): # Reference model predictions. 
with torch.no_grad(): - ref_preds = ref_net( - noisy_model_input.half(), - timesteps.half(), - encoder_hidden_states.half(), - ).sample.detach().float() + ref_preds = ( + ref_net( + noisy_model_input.half(), + timesteps.half(), + encoder_hidden_states.half(), + ) + .sample.detach() + .float() + ) ref_loss = F.mse_loss(ref_preds.float(), target.float(), reduction="none") ref_loss = ref_loss.mean(dim=list(range(1, len(ref_loss.shape)))) @@ -945,13 +955,15 @@ def optimizer_hook(p): checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + # before we save the new checkpoint, we need to have at _most_ + # `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + f"{len(checkpoints)} checkpoints already exist, removing " + f"{len(removing_checkpoints)} checkpoints" ) logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") @@ -964,9 +976,7 @@ def optimizer_hook(p): logger.info(f"Saved state to {save_path}") if global_step == 1 or (args.run_validation and global_step % args.validation_steps == 0): - log_validation( - args, unet=unet, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch - ) + log_validation(args, unet=unet, accelerator=accelerator, weight_dtype=weight_dtype, epoch=epoch) logs = { "loss": loss.detach().item(), diff --git a/experimental/LISA-diffusion/instruct_pix2pix/test_instruct_pix2pix.py b/experimental/LISA-diffusion/instruct_pix2pix/test_instruct_pix2pix.py index f72f95db6..533024c96 100644 --- a/experimental/LISA-diffusion/instruct_pix2pix/test_instruct_pix2pix.py +++ b/experimental/LISA-diffusion/instruct_pix2pix/test_instruct_pix2pix.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2024 HuggingFace Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,11 +17,9 @@ import sys import tempfile - sys.path.append("..") from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402 - logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger() diff --git a/experimental/LISA-diffusion/instruct_pix2pix/train_instruct_pix2pix_lisa.py b/experimental/LISA-diffusion/instruct_pix2pix/train_instruct_pix2pix_lisa.py index a722a6dd9..0b9f3d666 100644 --- a/experimental/LISA-diffusion/instruct_pix2pix/train_instruct_pix2pix_lisa.py +++ b/experimental/LISA-diffusion/instruct_pix2pix/train_instruct_pix2pix_lisa.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,6 +24,7 @@ import accelerate import datasets +import diffusers import numpy as np import PIL import requests @@ -37,20 +37,17 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset -from huggingface_hub import create_repo, upload_folder -from packaging import version -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import CLIPTextModel, CLIPTokenizer - -import diffusers from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionInstructPix2PixPipeline, UNet2DConditionModel from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, deprecate, is_wandb_available from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import is_compiled_module - +from huggingface_hub import create_repo, upload_folder +from packaging import version +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer # Will error if the minimal version of diffusers is not installed. Remove at your own risks. check_min_version("0.28.0.dev0") @@ -158,8 +155,7 @@ def parse_args(): type=int, default=None, help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." + "For debugging purposes or quicker training, truncate the number of training examples to this value if set." ), ) parser.add_argument( @@ -247,7 +243,11 @@ def parse_args(): "--conditioning_dropout_prob", type=float, default=None, - help="Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used in training InstructPix2Pix. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800.", + help=( + "Conditioning dropout probability. Drops out the conditionings (image and edit prompt) used " + "in training InstructPix2Pix. See section 3.2.1 in the paper: " + "https://arxiv.org/abs/2211.09800." + ), ) parser.add_argument( "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." 
@@ -377,25 +377,28 @@ def download_image(url): image = image.convert("RGB") return image + def freeze_all_layers(model): for param in model.parameters(): param.requires_grad = False -def random_activate_layers(model,p): - activate_number = int((len(list(model.parameters()))-2) * p) - index = np.random.choice(range(0,len(list(model.parameters()))-1,1), activate_number, replace=False) + +def random_activate_layers(model, p): + activate_number = int((len(list(model.parameters())) - 2) * p) + index = np.random.choice(range(0, len(list(model.parameters())) - 1, 1), activate_number, replace=False) count = 0 for param in model.parameters(): - if count == 0 or count == len(list(model.parameters()))-1: + if count == 0 or count == len(list(model.parameters())) - 1: param.requires_grad = True elif count in index: param.requires_grad = True - + count += 1 -def lisa(model,p=0.25): + +def lisa(model, p=0.25): freeze_all_layers(model) - random_activate_layers(model,p) + random_activate_layers(model, p) def main(): @@ -418,13 +421,14 @@ def main(): logging_dir = os.path.join(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) from accelerate import DistributedDataParallelKwargs + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, project_config=accelerator_project_config, - kwargs_handlers=[ddp_kwargs] + kwargs_handlers=[ddp_kwargs], ) generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) @@ -512,7 +516,9 @@ def main(): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during " + "training, please update xFormers to at least 0.0.17. " + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() else: @@ -578,7 +584,7 @@ def load_model_hook(models, input_dir): except ImportError: raise ImportError( "Please install bitsandbytes to use 8-bit Adam. 
You can do so by running `pip install bitsandbytes`" - ) + ) from None optimizer_cls = bnb.optim.AdamW8bit else: @@ -588,25 +594,28 @@ def load_model_hook(models, input_dir): scheduler_dict = dict() param_number = len(list(unet.parameters())) lisa_p = 8 / param_number - lisa(model=unet,p=lisa_p) + lisa(model=unet, p=lisa_p) for p in unet.parameters(): if p.requires_grad: - optimizer_dict[p] = optimizer_cls([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_cls( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) - + for p in unet.parameters(): if p.requires_grad: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) - + # define a hook function to update the parameter p during the backward pass def optimizer_hook(p): if p.grad is None: @@ -615,18 +624,21 @@ def optimizer_hook(p): return else: if p not in optimizer_dict: - optimizer_dict[p] = optimizer_cls([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_cls( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) if p not in scheduler_dict: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) optimizer_dict[p].step() optimizer_dict[p].zero_grad(set_to_none=True) @@ -675,7 +687,8 @@ def optimizer_hook(p): original_image_column = args.original_image_column if original_image_column not in column_names: raise ValueError( - f"--original_image_column' value '{args.original_image_column}' needs to be one of: {', '.join(column_names)}" + f"--original_image_column' value '{args.original_image_column}' needs to be one of: " + f"{', '.join(column_names)}" ) if args.edit_prompt_column is None: edit_prompt_column = dataset_columns[1] if dataset_columns is not None else column_names[1] @@ -691,7 +704,8 @@ def optimizer_hook(p): edited_image_column = args.edited_image_column if edited_image_column not in column_names: raise ValueError( - f"--edited_image_column' value '{args.edited_image_column}' needs to be one of: {', '.join(column_names)}" + f"--edited_image_column' value '{args.edited_image_column}' needs to be one of: " + f"{', '.join(column_names)}" ) # Preprocessing the datasets. 
@@ -854,10 +868,9 @@ def collate_fn(examples): unet.train() train_loss = 0.0 for step, batch in enumerate(train_dataloader): - if total_count % 20 == 0 and total_count != 0: param_number = len(list(unet.parameters())) - lisa(model=unet,p=lisa_p) + lisa(model=unet, p=lisa_p) total_count += 1 if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: if step % args.gradient_accumulation_steps == 0: @@ -933,7 +946,7 @@ def collate_fn(examples): accelerator.backward(loss) if accelerator.sync_gradients: accelerator.clip_grad_norm_(unet.parameters(), args.max_grad_norm) - + # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: if args.use_ema: @@ -951,13 +964,15 @@ def collate_fn(examples): checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + # before we save the new checkpoint, we need to have at _most_ + # `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + f"{len(checkpoints)} checkpoints already exist, removing " + f"{len(removing_checkpoints)} checkpoints" ) logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") diff --git a/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lisa.py b/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lisa.py index bbc3f0cdb..4d31bbd3d 100644 --- a/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lisa.py +++ b/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lisa.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,9 +23,10 @@ import random import shutil from pathlib import Path -from typing import List, Union +from typing import Union import accelerate +import diffusers import numpy as np import torch import torch.nn.functional as F @@ -38,6 +38,17 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from braceexpand import braceexpand +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + LCMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.training_utils import resolve_interpolation_mode +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available from huggingface_hub import create_repo, upload_folder from packaging import version from torch.utils.data import default_collate @@ -51,20 +62,6 @@ valid_sample, ) -import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - LCMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.optimization import get_scheduler -from diffusers.training_utils import resolve_interpolation_mode -from diffusers.utils import check_min_version, is_wandb_available -from diffusers.utils.import_utils import is_xformers_available - - MAX_SEQ_LENGTH = 77 if is_wandb_available(): @@ -80,21 +77,23 @@ def freeze_all_layers(model): for param in model.parameters(): param.requires_grad = False -def random_activate_layers(model,p): - activate_number = int((len(list(model.parameters()))-2) * p) - index = np.random.choice(range(0,len(list(model.parameters()))-1,1), activate_number, replace=False) + +def random_activate_layers(model, p): + activate_number = int((len(list(model.parameters())) - 2) * p) + index = np.random.choice(range(0, len(list(model.parameters())) - 1, 1), activate_number, replace=False) count = 0 for param in model.parameters(): - if count == 0 or count == len(list(model.parameters()))-1: + if count == 0 or count == len(list(model.parameters())) - 1: param.requires_grad = True elif count in index: param.requires_grad = True - + count += 1 -def lisa(model,p=0.25): + +def lisa(model, p=0.25): freeze_all_layers(model) - random_activate_layers(model,p) + random_activate_layers(model, p) def filter_keys(key_set): @@ -163,7 +162,7 @@ def __call__(self, x): class SDText2ImageDataset: def __init__( self, - train_shards_path_or_url: Union[str, List[str]], + train_shards_path_or_url: Union[str, list[str]], num_train_examples: int, per_gpu_batch_size: int, global_batch_size: int, @@ -239,7 +238,6 @@ def train_dataloader(self): return self._train_dataloader - def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="target"): logger.info("Running validation... 
") @@ -264,7 +262,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) validation_prompts = [ - "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", + "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography", # noqa: E501 # "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k", # "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", # "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece", @@ -275,7 +273,10 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe for _, prompt in enumerate(validation_prompts): images = [] from diffusers import UniPCMultistepScheduler - pipeline.scheduler = UniPCMultistepScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler") + + pipeline.scheduler = UniPCMultistepScheduler.from_pretrained( + "CompVis/stable-diffusion-v1-4", subfolder="scheduler" + ) with torch.autocast("cuda"): images = pipeline( prompt=prompt, @@ -288,7 +289,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe for _, prompt in enumerate(validation_prompts): images = [] - pipeline.scheduler=LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler") + pipeline.scheduler = LCMScheduler.from_pretrained(args.pretrained_teacher_model, subfolder="scheduler") with torch.autocast("cuda"): images = pipeline( prompt=prompt, @@ -330,7 +331,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step, name="targe torch.cuda.empty_cache() return image_logs - + # From LatentConsistencyModel.get_guidance_scale_embedding def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32): @@ -641,8 +642,7 @@ def parse_args(): type=int, default=None, help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." + "For debugging purposes or quicker training, truncate the number of training examples to this value if set." ), ) # ----Learning Rate---- @@ -882,6 +882,7 @@ def main(args): accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) from accelerate import DistributedDataParallelKwargs + ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -889,8 +890,11 @@ def main(args): log_with=args.report_to, project_config=accelerator_project_config, kwargs_handlers=[ddp_kwargs], - split_batches=True, # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes + split_batches=True, ) + # It's important to set `split_batches=True` when using webdataset to get the right number + # of steps for lr scheduling. 
If set to False, the number of steps will be devide by the number of + # processes assuming batches are multiplied by the number of processes # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -1066,7 +1070,9 @@ def load_model_hook(models, input_dir): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): logger.warning( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during " + "training, please update xFormers to at least 0.0.17. " + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() teacher_unet.enable_xformers_memory_efficient_attention() @@ -1089,7 +1095,7 @@ def load_model_hook(models, input_dir): except ImportError: raise ImportError( "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) + ) from None optimizer_class = bnb.optim.AdamW8bit else: @@ -1099,28 +1105,30 @@ def load_model_hook(models, input_dir): scheduler_dict = dict() param_number = len(list(unet.parameters())) print(param_number) - lisa_p = 0.25 # 128 / param_number # 32 / param_number - lisa(model=unet,p=lisa_p) + lisa_p = 0.25 # 128 / param_number # 32 / param_number + lisa(model=unet, p=lisa_p) for p in unet.parameters(): if p.requires_grad: - optimizer_dict[p] = optimizer_class([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_class( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) dataset = SDText2ImageDataset( - train_shards_path_or_url=args.train_shards_path_or_url, - num_train_examples=args.max_train_samples, - per_gpu_batch_size=args.train_batch_size, - global_batch_size=args.train_batch_size * accelerator.num_processes, - num_workers=args.dataloader_num_workers, - resolution=args.resolution, - interpolation_type=args.interpolation_type, - shuffle_buffer_size=1000, - pin_memory=True, - persistent_workers=True, - ) + train_shards_path_or_url=args.train_shards_path_or_url, + num_train_examples=args.max_train_samples, + per_gpu_batch_size=args.train_batch_size, + global_batch_size=args.train_batch_size * accelerator.num_processes, + num_workers=args.dataloader_num_workers, + resolution=args.resolution, + interpolation_type=args.interpolation_type, + shuffle_buffer_size=1000, + pin_memory=True, + persistent_workers=True, + ) train_dataloader = dataset.train_dataloader def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tokenizer, is_train=True): @@ -1128,11 +1136,11 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok return {"prompt_embeds": prompt_embeds} compute_embeddings_fn = functools.partial( - compute_embeddings, - proportion_empty_prompts=0, - text_encoder=text_encoder, - tokenizer=tokenizer, - ) + compute_embeddings, + proportion_empty_prompts=0, + text_encoder=text_encoder, + tokenizer=tokenizer, + ) # 14. 
LR Scheduler creation # Scheduler and math around the number of training steps. @@ -1145,12 +1153,13 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok for p in unet.parameters(): if p.requires_grad: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) - + # define a hook function to update the parameter p during the backward pass def optimizer_hook(p): if p.grad is None: @@ -1159,18 +1168,21 @@ def optimizer_hook(p): return else: if p not in optimizer_dict: - optimizer_dict[p] = optimizer_class([{"params":p}], - lr=args.learning_rate, - betas=(args.adam_beta1, args.adam_beta2), - weight_decay=args.adam_weight_decay, - eps=args.adam_epsilon) + optimizer_dict[p] = optimizer_class( + [{"params": p}], + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) optimizer_dict[p] = accelerator.prepare_optimizer(optimizer_dict[p]) if p not in scheduler_dict: scheduler_dict[p] = get_scheduler( - args.lr_scheduler, - optimizer=optimizer_dict[p], - num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, - num_training_steps=args.max_train_steps * accelerator.num_processes) + args.lr_scheduler, + optimizer=optimizer_dict[p], + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + ) scheduler_dict[p] = accelerator.prepare_scheduler(scheduler_dict[p]) if accelerator.sync_gradients: torch.nn.utils.clip_grad_norm_(p, args.max_grad_norm) @@ -1259,7 +1271,7 @@ def optimizer_hook(p): for step, batch in enumerate(train_dataloader): if total_count % 5 == 0 and total_count != 0: param_number = len(list(unet.parameters())) - lisa(model=unet,p=lisa_p) + lisa(model=unet, p=lisa_p) total_count += 1 with accelerator.accumulate(unet): # 1. Load and process the image and text conditioning @@ -1365,7 +1377,8 @@ def optimizer_hook(p): sigma_schedule, ) - # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 + # 2. 
Get teacher model prediction on noisy_model_input z_{t_{n + k}} and + # unconditional embedding 0 uncond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, @@ -1442,13 +1455,15 @@ def optimizer_hook(p): checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + # before we save the new checkpoint, we need to have at _most_ + # `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + f"{len(checkpoints)} checkpoints already exist, removing " + f"{len(removing_checkpoints)} checkpoints" ) logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") diff --git a/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lora.py b/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lora.py index 46470be86..3c63e45de 100644 --- a/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lora.py +++ b/experimental/LISA-diffusion/latent_consistency_model/train_lcm_distill_sd_wds_lora.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -24,9 +23,10 @@ import random import shutil from pathlib import Path -from typing import List, Union +from typing import Union import accelerate +import diffusers import numpy as np import torch import torch.nn.functional as F @@ -38,6 +38,17 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from braceexpand import braceexpand +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + LCMScheduler, + StableDiffusionPipeline, + UNet2DConditionModel, +) +from diffusers.optimization import get_scheduler +from diffusers.training_utils import resolve_interpolation_mode +from diffusers.utils import check_min_version, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available from huggingface_hub import create_repo, upload_folder from packaging import version from peft import LoraConfig, get_peft_model, get_peft_model_state_dict @@ -52,20 +63,6 @@ valid_sample, ) -import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - LCMScheduler, - StableDiffusionPipeline, - UNet2DConditionModel, -) -from diffusers.optimization import get_scheduler -from diffusers.training_utils import resolve_interpolation_mode -from diffusers.utils import check_min_version, is_wandb_available -from diffusers.utils.import_utils import is_xformers_available - - MAX_SEQ_LENGTH = 77 if is_wandb_available(): @@ -88,7 +85,7 @@ def get_module_kohya_state_dict(module, prefix: str, dtype: torch.dtype, adapter # Set alpha parameter if "lora_down" in kohya_key: - alpha_key = f'{kohya_key.split(".")[0]}.alpha' + alpha_key = f"{kohya_key.split('.')[0]}.alpha" kohya_ss_state_dict[alpha_key] = torch.tensor(module.peft_config[adapter_name].lora_alpha).to(dtype) return kohya_ss_state_dict @@ -160,7 +157,7 @@ def __call__(self, x): class SDText2ImageDataset: def __init__( self, - train_shards_path_or_url: Union[str, 
List[str]],
+        train_shards_path_or_url: Union[str, list[str]],
         num_train_examples: int,
         per_gpu_batch_size: int,
         global_batch_size: int,
@@ -264,7 +261,7 @@ def log_validation(vae, unet, args, accelerator, weight_dtype, step):
     generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

     validation_prompts = [
-        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",
+        "portrait photo of a girl, photograph, highly detailed face, depth of field, moody light, golden hour, style by Dan Winters, Russell James, Steve McCurry, centered, extremely detailed, Nikon D850, award winning photography",  # noqa: E501
         "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k",
         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
         "A photo of beautiful mountain with realistic sunset and blue lake, highly detailed, masterpiece",
@@ -625,8 +622,7 @@ def parse_args():
         type=int,
         default=None,
         help=(
-            "For debugging purposes or quicker training, truncate the number of training examples to this "
-            "value if set."
+            "For debugging purposes or quicker training, truncate the number of training examples to this value if set."
         ),
     )
     # ----Learning Rate----
@@ -719,7 +715,7 @@ def parse_args():
         help="The huber loss parameter. Only used if `--loss_type=huber`.",
     )
     parser.add_argument(
-        "--lora_rank",
+        "--lora_r",
         type=int,
         default=64,
         help="The rank of the LoRA projection matrix.",
     )
@@ -730,7 +726,7 @@ def parse_args():
         default=64,
         help=(
             "The value of the LoRA alpha parameter, which controls the scaling factor in front of the LoRA weight"
-            " update delta_W. No scaling will be performed if this value is equal to `lora_rank`."
+            " update delta_W. No scaling will be performed if this value is equal to `lora_r`."
         ),
     )
     parser.add_argument(
@@ -884,8 +880,11 @@ def main(args):
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         project_config=accelerator_project_config,
-        split_batches=True,  # It's important to set this to True when using webdataset to get the right number of steps for lr scheduling. If set to False, the number of steps will be devide by the number of processes assuming batches are multiplied by the number of processes
+        split_batches=True,
     )
+    # It's important to set `split_batches=True` when using webdataset to get the right number
+    # of steps for lr scheduling. If set to False, the number of steps will be divided by the
+    # number of processes, assuming batches are multiplied by the number of processes.

     # Make one log on every process with the configuration for debugging.
     logging.basicConfig(
@@ -999,7 +998,7 @@ def main(args):
         "time_emb_proj",
     ]
     lora_config = LoraConfig(
-        r=args.lora_rank,
+        r=args.lora_r,
         target_modules=lora_target_modules,
         lora_alpha=args.lora_alpha,
         lora_dropout=args.lora_dropout,
@@ -1069,7 +1068,9 @@ def load_model_hook(models, input_dir):
             xformers_version = version.parse(xformers.__version__)
             if xformers_version == version.parse("0.0.16"):
                 logger.warning(
-                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details."
+                    "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during "
+                    "training, please update xFormers to at least 0.0.17. 
" + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() teacher_unet.enable_xformers_memory_efficient_attention() @@ -1092,7 +1093,7 @@ def load_model_hook(models, input_dir): except ImportError: raise ImportError( "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." - ) + ) from None optimizer_class = bnb.optim.AdamW8bit else: @@ -1324,7 +1325,8 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok sigma_schedule, ) - # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and unconditional embedding 0 + # 2. Get teacher model prediction on noisy_model_input z_{t_{n + k}} and + # unconditional embedding 0 uncond_teacher_output = teacher_unet( noisy_model_input.to(weight_dtype), start_timesteps, @@ -1405,13 +1407,15 @@ def compute_embeddings(prompt_batch, proportion_empty_prompts, text_encoder, tok checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + # before we save the new checkpoint, we need to have at _most_ + # `checkpoints_total_limit - 1` checkpoints if len(checkpoints) >= args.checkpoints_total_limit: num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 removing_checkpoints = checkpoints[0:num_to_remove] logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + f"{len(checkpoints)} checkpoints already exist, removing " + f"{len(removing_checkpoints)} checkpoints" ) logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") diff --git a/experimental/LISA-diffusion/single_lisa.py b/experimental/LISA-diffusion/single_lisa.py index 73d5bccaf..330fe8864 100644 --- a/experimental/LISA-diffusion/single_lisa.py +++ b/experimental/LISA-diffusion/single_lisa.py @@ -1,9 +1,5 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import math,random import numpy as np -import accelerate +import torch class LISADiffusion: @@ -17,14 +13,14 @@ def freeze_all_layers(self, model): param.requires_grad = False def random_activate_layers(self, model, p): - activate_number = int((len(list(model.parameters()))-2) * p) - index = np.random.choice(range(0,len(list(model.parameters()))-1,1), activate_number, replace=False) + activate_number = int((len(list(model.parameters())) - 2) * p) + index = np.random.choice(range(0, len(list(model.parameters())) - 1, 1), activate_number, replace=False) count = 0 for param in model.parameters(): - if count == 0 or count == len(list(model.parameters()))-1: + if count == 0 or count == len(list(model.parameters())) - 1: param.requires_grad = True elif count in index: - param.requires_grad = True + param.requires_grad = True count += 1 def lisa(self, model, p=0.25): @@ -34,18 +30,29 @@ def lisa(self, model, p=0.25): def lisa_recall(self): param_number = len(list(self.model.parameters())) lisa_p = 8 / param_number if self.rate is None else self.rate - self.lisa(model=self.model,p=lisa_p) + self.lisa(model=self.model, p=lisa_p) def initialize(self): self.optimizer_dict = dict() self.scheduler_dict = dict() self.lisa_recall() - def register(self, optimizer_class=None, get_scheduler=None, accelerator=None, - optim_kwargs={}, sched_kwargs={}): + def register( + self, + optimizer_class=None, + get_scheduler=None, + 
accelerator=None, + optim_kwargs=None, + sched_kwargs=None, + ): + if optim_kwargs is None: + optim_kwargs = {} + if sched_kwargs is None: + sched_kwargs = {} + for p in self.model.parameters(): if p.requires_grad: - self.optimizer_dict[p] = optimizer_class([{"params":p}], **optim_kwargs) + self.optimizer_dict[p] = optimizer_class([{"params": p}], **optim_kwargs) if accelerator is not None: self.optimizer_dict[p] = accelerator.prepare_optimizer(self.optimizer_dict[p]) @@ -54,9 +61,20 @@ def register(self, optimizer_class=None, get_scheduler=None, accelerator=None, self.scheduler_dict[p] = get_scheduler(optimizer=self.optimizer_dict[p], **sched_kwargs) if accelerator is not None: self.scheduler_dict[p] = accelerator.prepare_scheduler(self.scheduler_dict[p]) - - def insert_hook(self, optimizer_class=None, get_scheduler=None, accelerator=None, - optim_kwargs={}, sched_kwargs={}): + + def insert_hook( + self, + optimizer_class=None, + get_scheduler=None, + accelerator=None, + optim_kwargs=None, + sched_kwargs=None, + ): + if optim_kwargs is None: + optim_kwargs = {} + if sched_kwargs is None: + sched_kwargs = {} + def optimizer_hook(p): if p.grad is None: del self.scheduler_dict[p] @@ -64,9 +82,9 @@ def optimizer_hook(p): return else: if p not in self.optimizer_dict: - self.optimizer_dict[p] = optimizer_class([{"params":p}], **optim_kwargs) + self.optimizer_dict[p] = optimizer_class([{"params": p}], **optim_kwargs) if accelerator is not None: - self.optimizer_dict[p] = accelerator.prepare_optimizer(self.optimizer_dict[p]) + self.optimizer_dict[p] = accelerator.prepare_optimizer(self.optimizer_dict[p]) if p not in self.scheduler_dict: self.scheduler_dict[p] = get_scheduler(optimizer=self.optimizer_dict[p], **sched_kwargs) if accelerator is not None: @@ -74,7 +92,7 @@ def optimizer_hook(p): if accelerator is not None and accelerator.sync_gradients: torch.nn.utils.clip_grad_norm_(p, 10.0) - + self.optimizer_dict[p].step() self.optimizer_dict[p].zero_grad(set_to_none=True) self.scheduler_dict[p].step() diff --git a/experimental/RAFT-diffusion/SD256-RAFT.ipynb b/experimental/RAFT-diffusion/SD256-RAFT.ipynb index bfa219307..e7e67ec54 100644 --- a/experimental/RAFT-diffusion/SD256-RAFT.ipynb +++ b/experimental/RAFT-diffusion/SD256-RAFT.ipynb @@ -1,503 +1,523 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "YoURrh11fbIc" - }, - "source": [ - "# RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment\n", - "\n", - "This notebook beautifully showcases how RAFT can be leveraged to fine-tune a model.\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "Curious how this works? Read our [paper](https://arxiv.org/abs/2304.06767) to explore the intricacies of our innovative approach." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BzmCovNKkwbi" - }, - "source": [ - "## Initial Setup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "n7TI5hirlzn8" - }, - "outputs": [], - "source": [ - "#@title Install the required libs\n", - "%pip install -q accelerate diffusers transformers ftfy bitsandbytes gradio natsort safetensors xformers datasets\n", - "%pip install -qq \"ipywidgets>=7,<8\"\n", - "!wget -q https://raw.githubusercontent.com/OptimalScale/LMFlow/main/experimental/RAFT-diffusion/train_text_to_image_lora.py" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "fvCBZCnrqcX1" - }, - "outputs": [], - "source": [ - "#@title Install CLIP\n", - "\n", - "!pip install git+https://github.com/deepgoyal19/CLIP.git" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "guDgmswnmW-4" - }, - "outputs": [], - "source": [ - "#@title Import required libraries\n", - "import argparse\n", - "import itertools\n", - "import math\n", - "import os\n", - "import shutil\n", - "from os.path import expanduser # pylint: disable=import-outside-toplevel\n", - "from urllib.request import urlretrieve # pylint: disable=import-outside-toplevel\n", - "from contextlib import nullcontext\n", - "import random\n", - "import pandas as pd\n", - "import numpy as np\n", - "import torch\n", - "import torch.nn.functional as F\n", - "import torch.utils.checkpoint\n", - "from torch.utils.data import Dataset\n", - "import concurrent\n", - "import PIL\n", - "from accelerate import Accelerator\n", - "from accelerate.logging import get_logger\n", - "from accelerate.utils import set_seed\n", - "from diffusers import AutoencoderKL, DDPMScheduler, PNDMScheduler, StableDiffusionPipeline, UNet2DConditionModel, DPMSolverMultistepScheduler\n", - "from diffusers.optimization import get_scheduler\n", - "from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker\n", - "from PIL import Image\n", - "from torchvision import transforms\n", - "from tqdm.auto import tqdm\n", - "from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer\n", - "import clip\n", - "import bitsandbytes as bnb\n", - "from torch.utils.data import DataLoader\n", - "def image_grid(imgs, rows, cols):\n", - " assert len(imgs) == rows*cols\n", - "\n", - " w, h = imgs[0].size\n", - " grid = Image.new('RGB', size=(cols*w, rows*h))\n", - " grid_w, grid_h = grid.size\n", - " \n", - " for i, img in enumerate(imgs):\n", - " grid.paste(img, box=(i%cols*w, i//cols*h))\n", - " return grid" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "f4D64FI9pI38" - }, - "source": [ - "## Loading Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7IryKE4wq0SZ", - "cellView": "form" - }, - "outputs": [], - "source": [ - "#@title Creating Dataloader\n", - "\n", - "prompts=['airplane','automobile','bird','deer','dog','cat','frog','horse','ship','truck'] # CIFAR labels\n", - "prompts = pd.DataFrame({'prompts': prompts}) #converting prompts list into a pandas dataframe\n", - "\n", - "class CIFAR10Dataset():\n", - " def __init__(self):\n", - " global prompts\n", - " self.prompts=prompts.iloc[:,0]\n", - " \n", - " def __len__(self):\n", - " return len(self.prompts)\n", - " \n", - " def __getitem__(self,index):\n", - " return self.prompts.iloc[index]\n", - "\n", - "#@markdown Please mention the batch size.\n", - "batch_size =5 #@param 
{type:\"integer\"}\n", - "\n", - "\n", - "dataset = CIFAR10Dataset()\n", - "finetune_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BWH9vc1kvhvC" - }, - "source": [ - "## Loading CLIP" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lJAguhs1d89L" - }, - "outputs": [], - "source": [ - "def get_aesthetic_model(clip_model=\"vit_l_14\"):\n", - " \"\"\"load the aethetic model\"\"\"\n", - " home = expanduser(\"~\")\n", - " cache_folder = home + \"/.cache/emb_reader\"\n", - " path_to_model = cache_folder + \"/sa_0_4_\"+clip_model+\"_linear.pth\"\n", - " if not os.path.exists(path_to_model):\n", - " os.makedirs(cache_folder, exist_ok=True)\n", - " url_model = (\n", - " \"https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_\"+clip_model+\"_linear.pth?raw=true\"\n", - " )\n", - " urlretrieve(url_model, path_to_model)\n", - " if clip_model == \"vit_l_14\":\n", - " m = torch.nn.Linear(768, 1)\n", - " elif clip_model == \"vit_b_32\":\n", - " m = torch.nn.Linear(512, 1)\n", - " else:\n", - " raise ValueError()\n", - " s = torch.load(path_to_model)\n", - " m.load_state_dict(s)\n", - " m.eval()\n", - " return m\n", - "\n", - "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", - "amodel= get_aesthetic_model(clip_model=\"vit_l_14\").to(device)\n", - "amodel.eval()\n", - "\n", - "model, preprocess = clip.load('ViT-L/14', device=device)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0RPeQGHUzUZp" - }, - "source": [ - "## Evaluating Aesthetic Score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "s61Ljr9Sd89M" - }, - "outputs": [], - "source": [ - "def get_image_score(image): #Evaluating Scores if images\n", - " images = preprocess(image).unsqueeze(0).to(device)\n", - " with torch.no_grad():\n", - " image_features= model.encode_image(images).to(device)\n", - " image_features /= image_features.norm(dim=-1, keepdim=True)\n", - " image_features=image_features.to(torch.float32)\n", - " prediction = amodel(image_features)\n", - " return(float(prediction))\n", - " \n", - "def get_max_score(image_list,index,epoch=0): #The get_max_score function will return prompt's image with the highest aesthetic score will be chosen for additional fine-tuning.\n", - " score_list=[]\n", - " for image in image_list:\n", - " score_list.append(get_image_score(image))\n", - " torch.cuda.empty_cache()\n", - "\n", - " prompts.loc[index,f'Epoch{epoch} Scores']=max(score_list)\n", - " return [max(score_list),score_list.index(max(score_list))]\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Ak1jArUL0eCi" - }, - "source": [ - "##Parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jv6WYJos0iT5", - "cellView": "form" - }, - "outputs": [], - "source": [ - "#@title Settings for the model\n", - "\n", - "#@markdown All settings have been configured to achieve optimal output. Changing them is not advisable.\n", - "\n", - "#@markdown Enter value for `resolution`.\n", - "resolution=256 #@param {type:\"integer\"}\n", - "\n", - "#@markdown Enter value for `num_images_per_prompt`.\n", - "num_images_per_prompt=10 #@param {type:\"integer\"} \n", - "\n", - "#@markdown Enter value for `epochs`. 
\n", - "epochs=10 #@param {type:\"integer\"} |" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7gFbnMaLd89N", - "cellView": "form" - }, - "outputs": [], - "source": [ - "# @title Setting Stable Diffusion pipeline\n", - "model_id = \"runwayml/stable-diffusion-v1-5\"\n", - "pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)\n", - "pipe.enable_xformers_memory_efficient_attention()\n", - "torch.cuda.empty_cache()\n", - "\n", - "#@markdown Check the `set_progress_bar_config` option if you would like to hide the progress bar for image generation\n", - "set_progress_bar_config= False #@param {type:\"boolean\"}\n", - "pipe.set_progress_bar_config(disable=set_progress_bar_config) \n", - "\n", - "\n", - "scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)\n", - "pipe.scheduler = scheduler\n", - "\n", - "torch.cuda.empty_cache()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9U2P_PUN-5xX" - }, - "source": [ - "##Finetuning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "F-m6S9Sg-yS_", - "cellView": "form" - }, - "outputs": [], - "source": [ - "#@title Generating images on the pretrained model\n", - "\n", - "#@markdown Check the box to generate images using the pretrained model.\n", - "generate_pretrained_model_images= True #@param {type:\"boolean\"}\n", - "\n", - "if generate_pretrained_model_images:\n", - " image_list=[]\n", - " for step, prompt_list in enumerate(finetune_dataloader):\n", - " image=pipe(prompt_list,num_images_per_prompt=num_images_per_prompt,width=resolution,height=resolution).images \n", - " image_list+=image\n", - " torch.cuda.empty_cache()\n", - "\n", - " grid = image_grid(image_list, len(prompts),num_images_per_prompt)\n", - " grid.save(\"pretrained.png\") \n", - " grid\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kPfHR4HQd89N" - }, - "outputs": [], - "source": [ - "#@title Run training\n", - "\n", - "os.environ['MODEL_NAME'] = model_id\n", - "os.environ['OUTPUT_DIR'] = f\"./CustomModel/\"\n", - "topk=8\n", - "training_steps_per_epoch=topk*10\n", - "os.environ['CHECKPOINTING_STEPS']=str(training_steps_per_epoch)\n", - "os.environ['RESOLUTION']=str(resolution)\n", - "os.environ['LEARNING_RATE']=str(9e-6)\n", - "\n", - "# remove old account directory\n", - "try: \n", - " shutil.rmtree('./CustomModel')\n", - "except:\n", - " pass\n", - "try: \n", - " shutil.rmtree('./trainingdataset/imagefolder/')\n", - "except:\n", - " pass\n", - "\n", - "model_id = \"runwayml/stable-diffusion-v1-5\"\n", - "\n", - "\n", - "for epoch in range(epochs+1):\n", - " print(\"Epoch: \",epoch)\n", - " epoch=epoch\n", - " training_steps=str(training_steps_per_epoch*(epoch+1))\n", - " os.environ['TRAINING_STEPS']=training_steps\n", - " os.environ['TRAINING_DIR'] = f'./trainingdataset/imagefolder/{epoch}'\n", - "\n", - " training_prompts=[]\n", - " prompts[f'Epoch{epoch} Scores']=np.nan\n", - "\n", - " for step, prompt_list in enumerate(finetune_dataloader):\n", - " image=pipe(prompt_list,num_images_per_prompt=num_images_per_prompt,width=resolution,height=resolution).images\n", - " image_list=[]\n", - "\n", - " for i in range(int(len(image)/num_images_per_prompt)):\n", - " image_list.append(image[i*num_images_per_prompt:(i+1)*num_images_per_prompt])\n", - " torch.cuda.empty_cache()\n", - " \n", - " with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:\n", - " 
step_list=[i for i in range(step*batch_size,(step+1)*batch_size)]\n", - " score_index=executor.map(get_max_score,image_list,step_list,[epoch for i in range(len(step_list))])\n", - "\n", - " iterator=0\n", - " for max_scores in score_index:\n", - " training_prompts.append([max_scores[0],image_list[iterator][max_scores[1]],prompt_list[iterator]])\n", - " iterator+=1\n", - "\n", - " training_prompts=[row[1:3] for row in sorted(training_prompts,key=lambda x: (x[0]),reverse=True)[:topk]]\n", - " training_prompts=pd.DataFrame(training_prompts)\n", - "\n", - " if not os.path.exists(f\"./trainingdataset/imagefolder/{epoch}/train/\"):\n", - " os.makedirs(f\"./trainingdataset/imagefolder/{epoch}/train/\")\n", - " if not os.path.exists(f\"./CustomModel/\"):\n", - " os.makedirs(f\"./CustomModel/\")\n", - " for i in range(len(training_prompts)):\n", - " training_prompts.iloc[i,0].save(f'./trainingdataset/imagefolder/{epoch}/train/{i}.png')\n", - "\n", - " training_prompts['file_name']=[f\"{i}.png\" for i in range(len(training_prompts))]\n", - " training_prompts.columns = ['0','text','file_name']\n", - " training_prompts.drop('0',axis=1,inplace=True)\n", - " training_prompts.to_csv(f'./trainingdataset/imagefolder/{epoch}/train/metadata.csv',index=False)\n", - " torch.cuda.empty_cache()\n", - "\n", - " if epoch=7,<8\"\n", + "!wget -q https://raw.githubusercontent.com/OptimalScale/LMFlow/main/experimental/RAFT-diffusion/train_text_to_image_lora.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "fvCBZCnrqcX1" + }, + "outputs": [], + "source": [ + "# @title Install CLIP\n", + "\n", + "!pip install git+https://github.com/deepgoyal19/CLIP.git" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "guDgmswnmW-4" + }, + "outputs": [], + "source": [ + "# @title Import required libraries\n", + "import argparse\n", + "import itertools\n", + "import math\n", + "import os\n", + "import shutil\n", + "from os.path import expanduser # pylint: disable=import-outside-toplevel\n", + "from urllib.request import urlretrieve # pylint: disable=import-outside-toplevel\n", + "from contextlib import nullcontext\n", + "import random\n", + "import pandas as pd\n", + "import numpy as np\n", + "import torch\n", + "import torch.nn.functional as F\n", + "import torch.utils.checkpoint\n", + "from torch.utils.data import Dataset\n", + "import concurrent\n", + "import PIL\n", + "from accelerate import Accelerator\n", + "from accelerate.logging import get_logger\n", + "from accelerate.utils import set_seed\n", + "from diffusers import (\n", + " AutoencoderKL,\n", + " DDPMScheduler,\n", + " PNDMScheduler,\n", + " StableDiffusionPipeline,\n", + " UNet2DConditionModel,\n", + " DPMSolverMultistepScheduler,\n", + ")\n", + "from diffusers.optimization import get_scheduler\n", + "from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker\n", + "from PIL import Image\n", + "from torchvision import transforms\n", + "from tqdm.auto import tqdm\n", + "from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer\n", + "import clip\n", + "import bitsandbytes as bnb\n", + "from torch.utils.data import DataLoader\n", + "\n", + "\n", + "def image_grid(imgs, rows, cols):\n", + " assert len(imgs) == rows * cols\n", + "\n", + " w, h = imgs[0].size\n", + " grid = Image.new(\"RGB\", size=(cols * w, rows * h))\n", + " grid_w, grid_h = grid.size\n", + "\n", + " for i, img in enumerate(imgs):\n", + " grid.paste(img, box=(i % cols * w, 
i // cols * h))\n", + " return grid" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "f4D64FI9pI38" + }, + "source": [ + "## Loading Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "7IryKE4wq0SZ" + }, + "outputs": [], + "source": [ + "# @title Creating Dataloader\n", + "\n", + "prompts = [\"airplane\", \"automobile\", \"bird\", \"deer\", \"dog\", \"cat\", \"frog\", \"horse\", \"ship\", \"truck\"] # CIFAR labels\n", + "prompts = pd.DataFrame({\"prompts\": prompts}) # converting prompts list into a pandas dataframe\n", + "\n", + "\n", + "class CIFAR10Dataset:\n", + " def __init__(self):\n", + " global prompts\n", + " self.prompts = prompts.iloc[:, 0]\n", + "\n", + " def __len__(self):\n", + " return len(self.prompts)\n", + "\n", + " def __getitem__(self, index):\n", + " return self.prompts.iloc[index]\n", + "\n", + "\n", + "# @markdown Please mention the batch size.\n", + "batch_size = 5 # @param {type:\"integer\"}\n", + "\n", + "\n", + "dataset = CIFAR10Dataset()\n", + "finetune_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BWH9vc1kvhvC" + }, + "source": [ + "## Loading CLIP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lJAguhs1d89L" + }, + "outputs": [], + "source": [ + "def get_aesthetic_model(clip_model=\"vit_l_14\"):\n", + " \"\"\"load the aethetic model\"\"\"\n", + " home = expanduser(\"~\")\n", + " cache_folder = home + \"/.cache/emb_reader\"\n", + " path_to_model = cache_folder + \"/sa_0_4_\" + clip_model + \"_linear.pth\"\n", + " if not os.path.exists(path_to_model):\n", + " os.makedirs(cache_folder, exist_ok=True)\n", + " url_model = (\n", + " \"https://github.com/LAION-AI/aesthetic-predictor/blob/main/sa_0_4_\" + clip_model + \"_linear.pth?raw=true\"\n", + " )\n", + " urlretrieve(url_model, path_to_model)\n", + " if clip_model == \"vit_l_14\":\n", + " m = torch.nn.Linear(768, 1)\n", + " elif clip_model == \"vit_b_32\":\n", + " m = torch.nn.Linear(512, 1)\n", + " else:\n", + " raise ValueError()\n", + " s = torch.load(path_to_model)\n", + " m.load_state_dict(s)\n", + " m.eval()\n", + " return m\n", + "\n", + "\n", + "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", + "amodel = get_aesthetic_model(clip_model=\"vit_l_14\").to(device)\n", + "amodel.eval()\n", + "\n", + "model, preprocess = clip.load(\"ViT-L/14\", device=device)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0RPeQGHUzUZp" + }, + "source": [ + "## Evaluating Aesthetic Score" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s61Ljr9Sd89M" + }, + "outputs": [], + "source": [ + "def get_image_score(image): # Evaluating Scores if images\n", + " images = preprocess(image).unsqueeze(0).to(device)\n", + " with torch.no_grad():\n", + " image_features = model.encode_image(images).to(device)\n", + " image_features /= image_features.norm(dim=-1, keepdim=True)\n", + " image_features = image_features.to(torch.float32)\n", + " prediction = amodel(image_features)\n", + " return float(prediction)\n", + "\n", + "\n", + "def get_max_score(\n", + " image_list, index, epoch=0\n", + "): # The get_max_score function will return prompt's image with the highest aesthetic score will be chosen for additional fine-tuning.\n", + " score_list = []\n", + " for image in image_list:\n", + " score_list.append(get_image_score(image))\n", + " 
torch.cuda.empty_cache()\n", + "\n", + " prompts.loc[index, f\"Epoch{epoch} Scores\"] = max(score_list)\n", + " return [max(score_list), score_list.index(max(score_list))]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ak1jArUL0eCi" + }, + "source": [ + "##Parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "jv6WYJos0iT5" + }, + "outputs": [], + "source": [ + "# @title Settings for the model\n", + "\n", + "# @markdown All settings have been configured to achieve optimal output. Changing them is not advisable.\n", + "\n", + "# @markdown Enter value for `resolution`.\n", + "resolution = 256 # @param {type:\"integer\"}\n", + "\n", + "# @markdown Enter value for `num_images_per_prompt`.\n", + "num_images_per_prompt = 10 # @param {type:\"integer\"}\n", + "\n", + "# @markdown Enter value for `epochs`.\n", + "epochs = 10 # @param {type:\"integer\"} |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "7gFbnMaLd89N" + }, + "outputs": [], + "source": [ + "# @title Setting Stable Diffusion pipeline\n", + "model_id = \"runwayml/stable-diffusion-v1-5\"\n", + "pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to(device)\n", + "pipe.enable_xformers_memory_efficient_attention()\n", + "torch.cuda.empty_cache()\n", + "\n", + "# @markdown Check the `set_progress_bar_config` option if you would like to hide the progress bar for image generation\n", + "set_progress_bar_config = False # @param {type:\"boolean\"}\n", + "pipe.set_progress_bar_config(disable=set_progress_bar_config)\n", + "\n", + "\n", + "scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)\n", + "pipe.scheduler = scheduler\n", + "\n", + "torch.cuda.empty_cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9U2P_PUN-5xX" + }, + "source": [ + "##Finetuning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "F-m6S9Sg-yS_" + }, + "outputs": [], + "source": [ + "# @title Generating images on the pretrained model\n", + "\n", + "# @markdown Check the box to generate images using the pretrained model.\n", + "generate_pretrained_model_images = True # @param {type:\"boolean\"}\n", + "\n", + "if generate_pretrained_model_images:\n", + " image_list = []\n", + " for step, prompt_list in enumerate(finetune_dataloader):\n", + " image = pipe(\n", + " prompt_list, num_images_per_prompt=num_images_per_prompt, width=resolution, height=resolution\n", + " ).images\n", + " image_list += image\n", + " torch.cuda.empty_cache()\n", + "\n", + " grid = image_grid(image_list, len(prompts), num_images_per_prompt)\n", + " grid.save(\"pretrained.png\")\n", + " grid" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kPfHR4HQd89N" + }, + "outputs": [], + "source": [ + "# @title Run training\n", + "\n", + "os.environ[\"MODEL_NAME\"] = model_id\n", + "os.environ[\"OUTPUT_DIR\"] = f\"./CustomModel/\"\n", + "topk = 8\n", + "training_steps_per_epoch = topk * 10\n", + "os.environ[\"CHECKPOINTING_STEPS\"] = str(training_steps_per_epoch)\n", + "os.environ[\"RESOLUTION\"] = str(resolution)\n", + "os.environ[\"LEARNING_RATE\"] = str(9e-6)\n", + "\n", + "# remove old account directory\n", + "try:\n", + " shutil.rmtree(\"./CustomModel\")\n", + "except:\n", + " pass\n", + "try:\n", + " shutil.rmtree(\"./trainingdataset/imagefolder/\")\n", + "except:\n", + " pass\n", + "\n", + 
"model_id = \"runwayml/stable-diffusion-v1-5\"\n", + "\n", + "\n", + "for epoch in range(epochs + 1):\n", + " print(\"Epoch: \", epoch)\n", + " epoch = epoch\n", + " training_steps = str(training_steps_per_epoch * (epoch + 1))\n", + " os.environ[\"TRAINING_STEPS\"] = training_steps\n", + " os.environ[\"TRAINING_DIR\"] = f\"./trainingdataset/imagefolder/{epoch}\"\n", + "\n", + " training_prompts = []\n", + " prompts[f\"Epoch{epoch} Scores\"] = np.nan\n", + "\n", + " for step, prompt_list in enumerate(finetune_dataloader):\n", + " image = pipe(\n", + " prompt_list, num_images_per_prompt=num_images_per_prompt, width=resolution, height=resolution\n", + " ).images\n", + " image_list = []\n", + "\n", + " for i in range(int(len(image) / num_images_per_prompt)):\n", + " image_list.append(image[i * num_images_per_prompt : (i + 1) * num_images_per_prompt])\n", + " torch.cuda.empty_cache()\n", + "\n", + " with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:\n", + " step_list = [i for i in range(step * batch_size, (step + 1) * batch_size)]\n", + " score_index = executor.map(get_max_score, image_list, step_list, [epoch for i in range(len(step_list))])\n", + "\n", + " iterator = 0\n", + " for max_scores in score_index:\n", + " training_prompts.append([max_scores[0], image_list[iterator][max_scores[1]], prompt_list[iterator]])\n", + " iterator += 1\n", + "\n", + " training_prompts = [row[1:3] for row in sorted(training_prompts, key=lambda x: (x[0]), reverse=True)[:topk]]\n", + " training_prompts = pd.DataFrame(training_prompts)\n", + "\n", + " if not os.path.exists(f\"./trainingdataset/imagefolder/{epoch}/train/\"):\n", + " os.makedirs(f\"./trainingdataset/imagefolder/{epoch}/train/\")\n", + " if not os.path.exists(f\"./CustomModel/\"):\n", + " os.makedirs(f\"./CustomModel/\")\n", + " for i in range(len(training_prompts)):\n", + " training_prompts.iloc[i, 0].save(f\"./trainingdataset/imagefolder/{epoch}/train/{i}.png\")\n", + "\n", + " training_prompts[\"file_name\"] = [f\"{i}.png\" for i in range(len(training_prompts))]\n", + " training_prompts.columns = [\"0\", \"text\", \"file_name\"]\n", + " training_prompts.drop(\"0\", axis=1, inplace=True)\n", + " training_prompts.to_csv(f\"./trainingdataset/imagefolder/{epoch}/train/metadata.csv\", index=False)\n", + " torch.cuda.empty_cache()\n", + "\n", + " if epoch < epochs:\n", + " !accelerate launch --num_processes=1 --mixed_precision='fp16' --dynamo_backend='no' --num_machines=1 train_text_to_image_lora.py \\\n", + " --pretrained_model_name_or_path=$MODEL_NAME \\\n", + " --train_data_dir=$TRAINING_DIR \\\n", + " --resolution=$RESOLUTION \\\n", + " --train_batch_size=8 \\\n", + " --gradient_accumulation_steps=1 \\\n", + " --gradient_checkpointing \\\n", + " --max_grad_norm=1 \\\n", + " --mixed_precision=\"fp16\" \\\n", + " --max_train_steps=$TRAINING_STEPS \\\n", + " --learning_rate=$LEARNING_RATE \\\n", + " --lr_warmup_steps=0 \\\n", + " --enable_xformers_memory_efficient_attention \\\n", + " --dataloader_num_workers=1 \\\n", + " --output_dir=$OUTPUT_DIR \\\n", + " --lr_warmup_steps=0 \\\n", + " --seed=1234 \\\n", + " --checkpointing_steps=$CHECKPOINTING_STEPS \\\n", + " --resume_from_checkpoint=\"latest\" \\\n", + " --lr_scheduler='constant' \n", + "\n", + " pipe.unet.load_attn_procs(f\"./CustomModel/\")\n", + " torch.cuda.empty_cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rglR5r5gahMv" + }, + "source": [ + "##Results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "cellView": "form", + "id": "kcf9aY6od89O" + }, + "outputs": [], + "source": [ + "# @title Generating results on the fine-tuned model\n", + "\n", + "# @markdown Check the box to generate images using the fine-tuned model.\n", + "generate_finetuned_model_images = True # @param {type:\"boolean\"}\n", + "\n", + "if generate_finetuned_model_images:\n", + " image_list = []\n", + " pipe.unet.load_attn_procs(\"./CustomModel\")\n", + " for step, prompt_list in enumerate(finetune_dataloader):\n", + " image = pipe(\n", + " prompt_list, num_images_per_prompt=num_images_per_prompt, width=resolution, height=resolution\n", + " ).images\n", + " image_list += image\n", + " torch.cuda.empty_cache()\n", + "\n", + " grid = image_grid(image_list, len(prompts), num_images_per_prompt)\n", + " grid.save(\"trained.png\")\n", + " grid.show()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "BzmCovNKkwbi", + "f4D64FI9pI38", + "BWH9vc1kvhvC", + "0RPeQGHUzUZp", + "Ak1jArUL0eCi", + "9U2P_PUN-5xX", + "rglR5r5gahMv" + ], + "provenance": [] + }, + "gpuClass": "premium", + "kernelspec": { + "display_name": "deepanshu", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + }, + "vscode": { + "interpreter": { + "hash": "cd95ac8400f934ca97b7c7125945f5f2a4616fc88b7668f808354bfbb29c51b3" + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/experimental/RAFT-diffusion/train_text_to_image_lora.py b/experimental/RAFT-diffusion/train_text_to_image_lora.py index 49d87c79a..1ea20b045 100644 --- a/experimental/RAFT-diffusion/train_text_to_image_lora.py +++ b/experimental/RAFT-diffusion/train_text_to_image_lora.py @@ -1,4 +1,3 @@ -# coding=utf-8 # Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -23,6 +22,7 @@ from typing import Optional import datasets +import diffusers import numpy as np import torch import torch.nn.functional as F @@ -32,20 +32,17 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset -from huggingface_hub import HfFolder, Repository, create_repo, whoami, upload_folder -from packaging import version -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import CLIPTextModel, CLIPTokenizer - -import diffusers from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel from diffusers.loaders import AttnProcsLayers from diffusers.models.attention_processor import LoRAAttnProcessor from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available - +from huggingface_hub import HfFolder, create_repo, upload_folder, whoami +from packaging import version +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
check_min_version("0.15.0.dev0") @@ -74,7 +71,8 @@ def save_model_card(repo_name, images=None, base_model=str, dataset_name=str, re """ model_card = f""" # LoRA text2image fine-tuning - {repo_name} -These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. You can find some example images in the following. \n +These are LoRA adaption weights for {base_model}. The weights were fine-tuned on the {dataset_name} dataset. +You can find some example images in the following. \n\n {img_str} """ with open(os.path.join(repo_folder, "README.md"), "w") as f: @@ -155,8 +153,7 @@ def parse_args(): type=int, default=None, help=( - "For debugging purposes or quicker training, truncate the number of training examples to this " - "value if set." + "For debugging purposes or quicker training, truncate the number of training examples to this value if set." ), ) parser.add_argument( @@ -410,7 +407,7 @@ def main(): ).repo_id # Load scheduler, tokenizer and models. - noise_scheduler = DDPMScheduler.from_pretrained( args.pretrained_model_name_or_path, subfolder="scheduler") + noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") tokenizer = CLIPTokenizer.from_pretrained( args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision ) @@ -476,8 +473,10 @@ def main(): xformers_version = version.parse(xformers.__version__) if xformers_version == version.parse("0.0.16"): - logger.warn( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + logger.warning( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during " + "training, please update xFormers to at least 0.0.17. " + "See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." ) unet.enable_xformers_memory_efficient_attention() else: @@ -499,10 +498,10 @@ def main(): if args.use_8bit_adam: try: import bitsandbytes as bnb - except ImportError: + except ImportError as ex: raise ImportError( "Please install bitsandbytes to use 8-bit Adam. 
You can do so by running `pip install bitsandbytes`" - ) + ) from ex optimizer_cls = bnb.optim.AdamW8bit else: @@ -712,7 +711,7 @@ def collate_fn(examples): # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.num_train_timesteps, (bsz,), device=latents.device) timesteps = timesteps.long() - + # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) @@ -842,7 +841,6 @@ def collate_fn(examples): # for _ in range(args.num_validation_images): # images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]) - # for _ in range(1): # images.append(pipeline('a photo of cat').images[0]) # images[0].save('/root/autodl-tmp/deepanshu/output.png') diff --git a/pyproject.toml b/pyproject.toml index 723e5e1e2..96f807b1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,17 +2,55 @@ requires = ["setuptools >= 64"] build-backend = "setuptools.build_meta" +# ------------------------------- +# tool.ruff - Linting configuration +# ------------------------------- [tool.ruff] +# Note: While the formatter will attempt to format lines such that they remain within the line-length, +# it isn't a hard upper bound, and formatted lines may exceed the line-length. target-version = "py39" -indent-width = 4 +line-length = 120 +exclude = [ + "tests/utils/test_data_utils.py", + "tests/pipeline/test_memory_safe_vllm_inferencer.py", + "tests/models/test_hf_decoder_model.py", + "src/lmflow/pipeline/utils/raft_trainer.py", + "src/lmflow/pipeline/raft_aligner.py", + "examples/chatbot_gradio.py", +] -[tool.ruff.lint.isort] -lines-after-imports = 2 -known-first-party = ["lmflow"] +[tool.ruff.lint] +isort = {known-first-party = ["lmflow"]} +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # isort + "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # Can remove once 3.10+ is the minimum Python version + "UP007", + # `.log()` statement uses f-string + "G004", +] -[tool.ruff.format] -quote-style = "double" -indent-style = "space" -docstring-code-format = true -skip-magic-trailing-comma = false -line-ending = "auto" \ No newline at end of file +[tool.ruff.lint.per-file-ignores] +"tests/**/*.py" = ["F841"] +"src/lmflow/utils/conversation_template/*.py" = ["E501"] +"src/lmflow/utils/llava_conversation_lib.py" = ["E501"] +"src/lmflow/utils/constants.py" = ["E501"] diff --git a/requirements.txt b/requirements.txt index 66e885d6c..3db38164d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ tokenizers>=0.13.3 peft>=0.10.0 torch>=2.0.1 wandb -deepspeed>=0.14.4 sentencepiece transformers>=4.31.0 cpm_kernels==1.0.11 diff --git a/scripts/bash.sh b/scripts/archive/bash.sh similarity index 100% rename from scripts/bash.sh rename to scripts/archive/bash.sh diff --git a/scripts/convert_llama_weights_to_hf.py b/scripts/archive/convert_llama_weights_to_hf.py similarity index 99% rename from scripts/convert_llama_weights_to_hf.py rename to scripts/archive/convert_llama_weights_to_hf.py index 91d0dfad0..def86a4cc 100644 --- a/scripts/convert_llama_weights_to_hf.py +++ b/scripts/archive/convert_llama_weights_to_hf.py @@ -19,10 +19,8 @@ import shutil import torch - from transformers import LlamaConfig, LlamaForCausalLM - """ 
Sample usage: @@ -63,7 +61,7 @@ def compute_intermediate_size(n): def read_json(path): - with open(path, "r") as f: + with open(path) as f: return json.load(f) diff --git a/output_models/download.sh b/scripts/archive/download_model.sh similarity index 100% rename from output_models/download.sh rename to scripts/archive/download_model.sh diff --git a/scripts/export_llama_state_dict_checkpoint.py b/scripts/archive/export_llama_state_dict_checkpoint.py similarity index 76% rename from scripts/export_llama_state_dict_checkpoint.py rename to scripts/archive/export_llama_state_dict_checkpoint.py index 1a295ec57..990b2979c 100644 --- a/scripts/export_llama_state_dict_checkpoint.py +++ b/scripts/archive/export_llama_state_dict_checkpoint.py @@ -4,25 +4,17 @@ import os import torch -import transformers from peft import PeftModel from transformers import LlamaForCausalLM, LlamaTokenizer # noqa: E402 def permute(w): - return ( - w.view(n_heads, dim // n_heads // 2, 2, dim) - .transpose(1, 2) - .reshape(dim, dim) - ) + return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim) def unpermute(w): - return ( - w.view(n_heads, 2, dim // n_heads // 2, dim) - .transpose(1, 2) - .reshape(dim, dim) - ) + return w.view(n_heads, 2, dim // n_heads // 2, dim).transpose(1, 2).reshape(dim, dim) + def translate_state_dict_key(k): # noqa: C901 k = k.replace("base_model.model.", "") @@ -61,43 +53,45 @@ def translate_state_dict_key(k): # noqa: C901 print(k) raise NotImplementedError + PARAM_LIST = { -7:{ - "dim": 4096, - "multiple_of": 256, - "n_heads": 32, - "n_layers": 32, - "norm_eps": 1e-06, - "vocab_size": -1, -}, -13:{ - "dim": 5120, - "multiple_of": 256, - "n_heads": 40, - "n_layers": 40, - "norm_eps": 1e-06, - "vocab_size": -1, -}, -33:{ - "dim": 6656, - "multiple_of": 256, - "n_heads": 52, - "n_layers": 60, - "norm_eps": 1e-06, - "vocab_size": -1, -}} + 7: { + "dim": 4096, + "multiple_of": 256, + "n_heads": 32, + "n_layers": 32, + "norm_eps": 1e-06, + "vocab_size": -1, + }, + 13: { + "dim": 5120, + "multiple_of": 256, + "n_heads": 40, + "n_layers": 40, + "norm_eps": 1e-06, + "vocab_size": -1, + }, + 33: { + "dim": 6656, + "multiple_of": 256, + "n_heads": 52, + "n_layers": 60, + "norm_eps": 1e-06, + "vocab_size": -1, + }, +} BASE_MODEL = os.environ.get("BASE_MODEL", None) -assert ( - BASE_MODEL -), "Please specify a value for BASE_MODEL environment variable, e.g. `export BASE_MODEL=decapoda-research/llama-30b-hf`" # noqa: E501 +assert BASE_MODEL, ( + "Please specify a value for BASE_MODEL environment variable, e.g. " + "`export BASE_MODEL=decapoda-research/llama-30b-hf`" +) # noqa: E501 LORA_MODEL = os.environ.get("LORA_MODEL", None) MODEL_SIZE = int(os.environ.get("MODEL_SIZE", None)) -assert ( - MODEL_SIZE -), "Please specify a value for MODEL_SIZE environment variable, e.g. `export MODEL_SIZE=33`" # noqa: E501 +assert MODEL_SIZE, "Please specify a value for MODEL_SIZE environment variable, e.g. 
" +"`export MODEL_SIZE=33`" # noqa: E501 tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL) @@ -117,11 +111,9 @@ def translate_state_dict_key(k): # noqa: C901 dim = params["dim"] dims_per_head = dim // n_heads base = 10000.0 -inv_freq = 1.0 / ( - base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head) -) +inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head)) -if not (LORA_MODEL is None): +if LORA_MODEL is not None: lora_model = PeftModel.from_pretrained( base_model, LORA_MODEL, @@ -129,7 +121,6 @@ def translate_state_dict_key(k): # noqa: C901 torch_dtype=torch.float16, ) - # merge weights for layer in lora_model.base_model.model.model.layers: layer.self_attn.q_proj.merge_weights = True @@ -139,11 +130,6 @@ def translate_state_dict_key(k): # noqa: C901 lora_model_sd = lora_model.state_dict() - - - - - new_state_dict = {} for k, v in lora_model_sd.items(): new_k = translate_state_dict_key(k) @@ -165,7 +151,6 @@ def translate_state_dict_key(k): # noqa: C901 new_state_dict[new_k] = v - os.makedirs("./ckpt", exist_ok=True) torch.save(new_state_dict, "./ckpt/consolidated.00.pth") diff --git a/scripts/run_all_benchmark.sh b/scripts/archive/run_all_benchmark.sh similarity index 71% rename from scripts/run_all_benchmark.sh rename to scripts/archive/run_all_benchmark.sh index 102d1d623..0516766c1 100644 --- a/scripts/run_all_benchmark.sh +++ b/scripts/archive/run_all_benchmark.sh @@ -36,14 +36,14 @@ fi log_dir=output_dir/${model_name}_lmflow_chat_nll_eval mkdir -p ${log_dir} echo "[Evaluating] Evaluate on LMFlow_chat" -./scripts/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err +./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name lmflow_chat_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err log_dir=output_dir/${model_name}_all_nll_eval mkdir -p ${log_dir} echo "[Evaluating] Evaluate on [commonsense, wiki, instruction_following (gpt4) ] nll evaluation" -./scripts/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err +./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name all_nll_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err log_dir=output_dir/${model_name}_commonsense_qa_eval mkdir -p ${log_dir} echo "[Evaluating] Evaluate on commonsense QA Accuracy evaluation" -./scripts/run_benchmark.sh ${extra_args} --dataset_name commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err \ No newline at end of file +./scripts/archive/run_benchmark.sh ${extra_args} --dataset_name commonsense_qa_eval | tee ${log_dir}/benchmark.log 2> ${log_dir}/benchmark.err \ No newline at end of file diff --git a/scripts/run_app.sh b/scripts/archive/run_app.sh similarity index 73% rename from scripts/run_app.sh rename to scripts/archive/run_app.sh index 6b81bf68c..7d9146630 100644 --- a/scripts/run_app.sh +++ b/scripts/archive/run_app.sh @@ -1,6 +1,6 @@ #!/bin/bash -CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml service/app.py \ +CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/archive/accelerate_singlegpu_config.yaml service/app.py \ --model_name_or_path gpt2 \ --torch_dtype bfloat16 \ --max_new_tokens 200 \ No newline at end of file diff --git a/scripts/run_benchmark.sh b/scripts/archive/run_benchmark.sh similarity index 100% rename from scripts/run_benchmark.sh rename to scripts/archive/run_benchmark.sh 
diff --git a/scripts/run_chatbot.sh b/scripts/archive/run_chatbot.sh similarity index 73% rename from scripts/run_chatbot.sh rename to scripts/archive/run_chatbot.sh index 8e1c49be0..634e7b117 100755 --- a/scripts/run_chatbot.sh +++ b/scripts/archive/run_chatbot.sh @@ -12,11 +12,10 @@ if [ $# -ge 2 ]; then fi # --temperature 0.7 \ -accelerate launch --config_file configs/accelerator_multigpu_config.yaml \ +accelerate launch --config_file configs/archive/accelerate_multigpu_config.yaml \ examples/chatbot.py \ - --deepspeed configs/ds_config_chatbot.json \ + --deepspeed configs/archive/ds_config_chatbot.json \ --model_name_or_path ${model} \ - --use_accelerator True \ --max_new_tokens 256 \ --temperature 1.0 \ --end_string "#" \ diff --git a/scripts/run_chatbot_chatglm.sh b/scripts/archive/run_chatbot_chatglm.sh similarity index 83% rename from scripts/run_chatbot_chatglm.sh rename to scripts/archive/run_chatbot_chatglm.sh index af2fe4064..efaa40f47 100755 --- a/scripts/run_chatbot_chatglm.sh +++ b/scripts/archive/run_chatbot_chatglm.sh @@ -12,6 +12,6 @@ fi CUDA_VISIBLE_DEVICES=0 \ deepspeed examples/chatbot.py \ --arch_type encoder_decoder \ - --deepspeed configs/ds_config_chatbot.json \ + --deepspeed configs/archive/ds_config_chatbot.json \ --model_name_or_path ${model} \ ${lora_args} \ No newline at end of file diff --git a/scripts/run_chatbot_cpu.sh b/scripts/archive/run_chatbot_cpu.sh similarity index 82% rename from scripts/run_chatbot_cpu.sh rename to scripts/archive/run_chatbot_cpu.sh index 300c0fdc3..a53f63a55 100755 --- a/scripts/run_chatbot_cpu.sh +++ b/scripts/archive/run_chatbot_cpu.sh @@ -11,7 +11,7 @@ fi CUDA_VISIBLE_DEVICES="" \ python examples/chatbot.py \ - --deepspeed configs/ds_config_chatbot.json \ + --deepspeed configs/archive/ds_config_chatbot.json \ --model_name_or_path ${model} \ --device "cpu" \ ${lora_args} diff --git a/scripts/run_detail_gpu_memory.sh b/scripts/archive/run_detail_gpu_memory.sh similarity index 100% rename from scripts/run_detail_gpu_memory.sh rename to scripts/archive/run_detail_gpu_memory.sh diff --git a/scripts/run_dpo_align.sh b/scripts/archive/run_dpo_align.sh similarity index 100% rename from scripts/run_dpo_align.sh rename to scripts/archive/run_dpo_align.sh diff --git a/scripts/run_dpov2_align.sh b/scripts/archive/run_dpov2_align.sh similarity index 53% rename from scripts/run_dpov2_align.sh rename to scripts/archive/run_dpov2_align.sh index d96908931..d9430abff 100644 --- a/scripts/run_dpov2_align.sh +++ b/scripts/archive/run_dpov2_align.sh @@ -1,59 +1,40 @@ #!/bin/bash - -# Parses arguments -run_name=dpov2_align model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct -reference_model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct dataset_path=data/iterative-prompt/train -eval_dataset_path=data/iterative-prompt/eval -output_dir=output_models/${run_name} +output_dir=output_models/dpov2_align -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -r|--run_name) - run_name="$2" - shift - ;; - --model_name_or_path) - model_name_or_path="$2" - shift - ;; - --reference_model_name_or_path) - reference_model_name_or_path="$2" - shift - ;; - --dataset_path) - dataset_path="$2" - shift - ;; - --eval_dataset_path) - eval_dataset_path="$2" - shift - ;; - -o|--output_dir) - output_dir="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done +# DPO related arguments +reference_model_name_or_path=meta-llama/Meta-Llama-3-8B-Instruct +eval_dataset_path=data/iterative-prompt/eval +margin_scale=1.0 
+max_prompt_length=1000 +loss_type=sigmoid +sampling_paired_method=max_min +mask_prompt=True +length_penalty=0 +# Align +exp_id=dpov2_align project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${run_name} +log_dir=${project_dir}/log/${exp_id} mkdir -p ${output_dir} ${log_dir} -accelerate launch --config_file configs/accelerate_dsz3_config.yaml \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/dpov2_train.py \ --model_name_or_path ${model_name_or_path} \ + --trust_remote_code 0 \ --reference_model_name_or_path ${reference_model_name_or_path} \ --do_train True \ --dataset_path ${dataset_path} \ --eval_dataset_path ${eval_dataset_path} \ + --margin_scale ${margin_scale} \ + --max_prompt_length ${max_prompt_length} \ + --loss_type ${loss_type} \ + --sampling_paired_method ${sampling_paired_method} \ + --mask_prompt ${mask_prompt} \ + --length_penalty ${length_penalty} \ --bf16 True \ + --torch_dtype bfloat16 \ --learning_rate 5e-7 \ --lr_scheduler_type cosine \ --warmup_steps 100 \ @@ -62,20 +43,15 @@ accelerate launch --config_file configs/accelerate_dsz3_config.yaml \ --per_device_eval_batch_size 1 \ --gradient_accumulation_steps 16 \ --gradient_checkpointing True \ - --margin_scale 1.0 \ - --max_prompt_length 1000 \ --num_train_epochs 2 \ --logging_steps 2 \ --save_strategy epoch \ --save_steps 5000 \ --evaluation_strategy steps \ --eval_steps 100 \ - --loss_type sigmoid \ --output_dir ${output_dir} \ - --run_name ${run_name} \ - --sampling_paired_method max_min \ + --run_name ${exp_id} \ --report_to wandb \ - --mask_prompt True \ - --length_penalty 0 \ + --seed 42 \ | tee ${log_dir}/train.log \ 2> ${log_dir}/train.err \ No newline at end of file diff --git a/scripts/run_evaluation.sh b/scripts/archive/run_evaluation.sh similarity index 100% rename from scripts/run_evaluation.sh rename to scripts/archive/run_evaluation.sh diff --git a/scripts/run_evaluation_accelerator.sh b/scripts/archive/run_evaluation_accelerator.sh similarity index 83% rename from scripts/run_evaluation_accelerator.sh rename to scripts/archive/run_evaluation_accelerator.sh index 8959f6f4b..252793e2c 100644 --- a/scripts/run_evaluation_accelerator.sh +++ b/scripts/archive/run_evaluation_accelerator.sh @@ -4,7 +4,7 @@ if [ ! 
-d data/MedQA-USMLE ]; then cd data && ./download.sh MedQA-USMLE && cd - fi -CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singlegpu_config.yaml examples/evaluation.py \ +CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/archive/accelerate_singlegpu_config.yaml examples/evaluation.py \ --answer_type usmle \ --model_name_or_path gpt2-large \ --dataset_path data/MedQA-USMLE/validation \ @@ -13,5 +13,4 @@ CUDA_VISIBLE_DEVICES=0 accelerate launch --config_file configs/accelerator_singl --metric accuracy \ --output_dir output_dir/accelerator_1_card \ --inference_batch_size_per_device 1 \ - --use_accelerator_for_evaluator True \ --torch_dtype bfloat16 diff --git a/scripts/run_evaluation_with_lora.sh b/scripts/archive/run_evaluation_with_lora.sh similarity index 100% rename from scripts/run_evaluation_with_lora.sh rename to scripts/archive/run_evaluation_with_lora.sh diff --git a/scripts/archive/run_finetune.sh b/scripts/archive/run_finetune.sh new file mode 100755 index 000000000..9bd3d5793 --- /dev/null +++ b/scripts/archive/run_finetune.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Please run this script under ${project_id} in project directory of +# https://github.com/shizhediao/llm-ft +# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 + +# Parses arguments +model_name_or_path=gpt2 +dataset_path=data/alpaca/train_conversation +output_dir=output_models/finetune +deepspeed_args="--master_port=11000" +conversation_template=llama2 + +# Safety related arguments +trust_remote_code=0 + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + -o|--output_model_path) + output_dir="$2" + shift + ;; + --conversation_template) + conversation_template="$2" + shift + ;; + --deepspeed_args) + deepspeed_args="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# Finetune +exp_id=finetune +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${exp_id} +mkdir -p ${output_dir} ${log_dir} + +deepspeed ${deepspeed_args} \ + examples/finetune.py \ + --model_name_or_path ${model_name_or_path} \ + --trust_remote_code ${trust_remote_code} \ + --dataset_path ${dataset_path} \ + --output_dir ${output_dir} --overwrite_output_dir \ + --conversation_template ${conversation_template} \ + --num_train_epochs 0.01 \ + --learning_rate 2e-5 \ + --disable_group_texts 1 \ + --block_size 256 \ + --per_device_train_batch_size 1 \ + --deepspeed configs/archive/ds_config_zero3.json \ + --fp16 \ + --run_name finetune \ + --validation_split_percentage 0 \ + --logging_steps 20 \ + --do_train \ + --ddp_timeout 72000 \ + --save_steps 5000 \ + --dataloader_num_workers 1 \ + > >(tee ${log_dir}/train.log) \ + 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/archive/run_finetune_with_custom_optim.sh b/scripts/archive/run_finetune_with_custom_optim.sh new file mode 100644 index 000000000..09315d4d3 --- /dev/null +++ b/scripts/archive/run_finetune_with_custom_optim.sh @@ -0,0 +1,330 @@ +#!/bin/bash +# Please run this script under ${project_id} in project directory of +# https://github.com/shizhediao/llm-ft +# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 +# Parses arguments +model_name_or_path=gpt2 +dataset_path=data/alpaca/train_conversation + +# Other optional arguments that can improve memory saving +gradient_checkpointing=True +use_flash_attention=0 
+gradient_accumulation_steps=1 +batch_size=1 +block_size=256 +per_device_train_batch_size=1 +conversation_template=llama2 +optim=dummy +# Select an optimizer from the following options: +# - 'adamw_torch' +# - 'adafactor' +# - 'sgd' +# - 'lion_8bit' +# - 'lion_32bit' +# - 'rmsprop' +# Additional optimizers are shown below +learning_rate=1e-5 +lr_schedule=cosine +beta1=0.9 +beta2=0.999 +beta3=0.99 +weight_decay=0 +momentum=0 +num_epoch=0.01 +use_deepspeed=1 +seed=42 + +# Safety related arguments +trust_remote_code=0 + +# Enable model parallelism for multiple gpus, modify this if you prefer +# customized deepspeed zero-redundancy optimization settings +num_gpu=$(python -c "import torch; print(torch.cuda.device_count())") +ds_config_file=configs/archive/ds_config_zero0_no_offload.json +if [[ ${num_gpu} -ge 2 ]]; then + ds_config_file=configs/archive/ds_config_zero2_no_offload.json +fi + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + -o|--output_model_path) + output_dir="$2" + shift + ;; + --lisa_activated_layers) + lisa_activated_layers="$2" + shift + ;; + --lisa_interval_steps) + lisa_interval_steps="$2" + shift + ;; + --gradient_checkpointing) + gradient_checkpointing="$2" + shift + ;; + --deepspeed) + ds_config_file="$2" + shift + ;; + --use_flash_attention) + use_flash_attention="$2" + shift + ;; + --gradient_accumulation_steps) + gradient_accumulation_steps="$2" + shift + ;; + --block_size) + block_size="$2" + shift + ;; + --conversation_template) + conversation_template="$2" + shift + ;; + --per_device_train_batch_size|--batch_size) + per_device_train_batch_size="$2" + batch_size="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + --run_name) + run_name="$2" + shift + ;; + --optim) + optim="$2" + shift + ;; + --lr) + learning_rate="$2" + shift + ;; + --beta1) + beta1="$2" + shift + ;; + --beta2) + beta2="$2" + shift + ;; + --beta3) + beta3="$2" + shift + ;; + --weight_decay) + weight_decay="$2" + shift + ;; + --momentum) + momentum="$2" + shift + ;; + -n|--num_epoch) + num_epoch="$2" + shift + ;; + --lr_schedule) + lr_schedule="$2" + shift + ;; + --use_deepspeed) + use_deepspeed="$2" + shift + ;; + --seed) + seed="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +deepspeed_args="--master_port=1103" + +optim_suffix_args="" +if [ "${optim}" == "dummy" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_dummy_beta1 ${beta1}" + optim_suffix_args+=" --optim_dummy_beta2 ${beta2}" +elif [ "${optim}" == "adabelief" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adabound" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "lars" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_momentum ${momentum}" + optim_suffix_args+=" --optim_weight_decay 
${weight_decay}" +elif [ "${optim}" == "lamb" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adamax" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "nadam" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "radam" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adamp" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "sgdp" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_momentum ${momentum}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "yogi" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "sophia" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adan" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_beta3 ${beta3}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adam" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" +elif [ "${optim}" == "novograd" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "adadelta" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" +elif [ "${optim}" == "adagrad" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" +elif [ "${optim}" == "adamw_schedule_free" ]; then + optim_suffix_args="--use_customized_optim 1" + 
optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_beta1 ${beta1}" + optim_suffix_args+=" --optim_beta2 ${beta2}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +elif [ "${optim}" == "sgd_schedule_free" ]; then + optim_suffix_args="--use_customized_optim 1" + optim_suffix_args+=" --customized_optim ${optim}" + optim_suffix_args+=" --optim_momentum ${momentum}" + optim_suffix_args+=" --optim_weight_decay ${weight_decay}" +else + optim_suffix_args="--optim ${optim}" + optim_suffix_args+=" --adam_beta1 ${beta1}" + optim_suffix_args+=" --adam_beta2 ${beta2}" +fi + +# Finetune +exp_id=alpaca_${optim}_lr-${learning_rate}_beta1-${beta1}_beta2-${beta2}_lr-sched-${lr_schedule}_model-$(basename ${model_name_or_path})_batch-size-${batch_size}x${gradient_accumulation_steps}_seed-${seed} +echo "$(date): ${exp_id}..." + +tmp_dir=tmp +mkdir -p ${tmp_dir} + +prefix=${exp_id} +if [ -f ${tmp_dir}/${prefix}.mark ]; then + exit 0 +fi + +trap "rm -f ${tmp_dir}/${prefix}.mark" SIGINT SIGTERM SIGKILL +touch ${tmp_dir}/${prefix}.mark + +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${exp_id} +output_dir=output_models/${exp_id} +mkdir -p ${output_dir} ${log_dir} + +exe="deepspeed ${deepspeed_args}" +if [[ ${use_deepspeed} -eq 0 ]]; then + exe=python +fi +${exe} examples/finetune.py \ + --model_name_or_path ${model_name_or_path} \ + --trust_remote_code ${trust_remote_code} \ + --dataset_path ${dataset_path} \ + --output_dir ${output_dir} --overwrite_output_dir \ + --conversation_template ${conversation_template} \ + --num_train_epochs ${num_epoch} \ + --learning_rate ${learning_rate} \ + --lr_scheduler_type ${lr_schedule} \ + --disable_group_texts 1 \ + --block_size ${block_size} \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --bf16 \ + --deepspeed configs/archive/ds_config_zero2_no_offload.json \ + --torch_dtype bfloat16 \ + --run_name ${exp_id} \ + --validation_split_percentage 0 \ + --logging_steps 1 \ + --do_train \ + --ddp_timeout 72000 \ + --save_steps 5000 \ + --dataloader_num_workers 1 \ + --gradient_checkpointing ${gradient_checkpointing} \ + --use_flash_attention ${use_flash_attention} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --seed ${seed} \ + ${optim_suffix_args} \ + | tee ${log_dir}/train.log \ + 2> ${log_dir}/train.err + + +if [[ $? 
-ne 0 ]]; then + echo "$(date): failed" + rm -f ${tmp_dir}/${prefix}.mark +fi diff --git a/scripts/archive/run_finetune_with_lisa.sh b/scripts/archive/run_finetune_with_lisa.sh new file mode 100755 index 000000000..3b020445b --- /dev/null +++ b/scripts/archive/run_finetune_with_lisa.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# Please run this script under ${project_id} in project directory of +# https://github.com/shizhediao/llm-ft +# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 + +# Parses arguments +model_name_or_path=meta-llama/Llama-2-7b-hf +dataset_path=data/alpaca/train_conversation +output_dir=output_models/finetune_lisa +lisa_activated_layers=1 +lisa_interval_steps=20 + +# Other optional arguments that can improve memory saving +gradient_checkpointing=True +use_flash_attention=0 +gradient_accumulation_steps=1 +block_size=256 +per_device_train_batch_size=1 +conversation_template=llama2 + +# Safety related arguments +trust_remote_code=0 + +# Enable model parallelism for multiple gpus, modify this if you prefer +# customized deepspeed zero-redundancy optimization settings +num_gpu=$(python -c "import torch; print(torch.cuda.device_count())") +ds_config_file=configs/archive/ds_config_zero0_no_offload.json +if [ ${num_gpu} -ge 2 ]; then + ds_config_file=configs/archive/ds_config_zero2_no_offload.json +fi + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + -o|--output_model_path) + output_dir="$2" + shift + ;; + --lisa_activated_layers) + lisa_activated_layers="$2" + shift + ;; + --lisa_interval_steps) + lisa_interval_steps="$2" + shift + ;; + --gradient_checkpointing) + gradient_checkpointing="$2" + shift + ;; + --deepspeed) + ds_config_file="$2" + shift + ;; + --use_flash_attention) + use_flash_attention="$2" + shift + ;; + --gradient_accumulation_steps) + gradient_accumulation_steps="$2" + shift + ;; + --block_size) + block_size="$2" + shift + ;; + --conversation_template) + conversation_template="$2" + shift + ;; + --per_device_train_batch_size|--batch_size) + per_device_train_batch_size="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# Finetune +exp_id=finetune +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${exp_id} +mkdir -p ${output_dir} ${log_dir} + +python examples/finetune.py \ + --model_name_or_path ${model_name_or_path} \ + --trust_remote_code ${trust_remote_code} \ + --dataset_path ${dataset_path} \ + --output_dir ${output_dir} --overwrite_output_dir \ + --conversation_template ${conversation_template} \ + --num_train_epochs 1 \ + --learning_rate 1e-5 \ + --disable_group_texts 1 \ + --block_size ${block_size} \ + --per_device_train_batch_size ${per_device_train_batch_size} \ + --bf16 \ + --torch_dtype bfloat16 \ + --run_name finetune \ + --optim paged_adamw_32bit \ + --validation_split_percentage 0 \ + --logging_steps 20 \ + --do_train \ + --ddp_timeout 72000 \ + --save_steps 5000 \ + --dataloader_num_workers 1 \ + --gradient_checkpointing ${gradient_checkpointing} \ + --use_flash_attention ${use_flash_attention} \ + --gradient_accumulation_steps ${gradient_accumulation_steps} \ + --use_lisa 1 \ + --lisa_activated_layers ${lisa_activated_layers} \ + --lisa_interval_steps ${lisa_interval_steps} \ + | tee ${log_dir}/train.log \ + 2> ${log_dir}/train.err diff --git a/scripts/archive/run_finetune_with_lora.sh 
b/scripts/archive/run_finetune_with_lora.sh new file mode 100755 index 000000000..c2ab0372e --- /dev/null +++ b/scripts/archive/run_finetune_with_lora.sh @@ -0,0 +1,78 @@ +#!/bin/bash +# Please run this script under ${project_id} in project directory of + +# Parses arguments +model_name_or_path=gpt2 +dataset_path=data/alpaca/train_conversation +conversation_template=llama2 +output_dir=output_models/finetune +deepspeed_args="--master_port=11000" + +# Safety related arguments +trust_remote_code=0 + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + --conversation_template) + conversation_template="$2" + shift + ;; + -o|--output_lora_path) + output_dir="$2" + shift + ;; + --deepspeed_args) + deepspeed_args="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# Finetune +exp_id=finetune_with_lora +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${exp_id} +mkdir -p ${output_dir} ${log_dir} + +deepspeed ${deepspeed_args} \ + examples/finetune.py \ + --model_name_or_path ${model_name_or_path} \ + --trust_remote_code ${trust_remote_code} \ + --dataset_path ${dataset_path} \ + --conversation_template ${conversation_template} \ + --output_dir ${output_dir} --overwrite_output_dir \ + --num_train_epochs 0.01 \ + --learning_rate 1e-4 \ + --block_size 512 \ + --per_device_train_batch_size 1 \ + --use_lora 1 \ + --lora_r 8 \ + --save_aggregated_lora 0\ + --deepspeed configs/archive/ds_config_zero2.json \ + --fp16 \ + --run_name ${exp_id} \ + --validation_split_percentage 0 \ + --logging_steps 20 \ + --do_train \ + --ddp_timeout 72000 \ + --save_steps 5000 \ + --dataloader_num_workers 1 \ + | tee ${log_dir}/train.log \ + 2> ${log_dir}/train.err diff --git a/scripts/archive/run_finetune_with_qlora.sh b/scripts/archive/run_finetune_with_qlora.sh new file mode 100644 index 000000000..c620580de --- /dev/null +++ b/scripts/archive/run_finetune_with_qlora.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# Please run this script under ${project_id} in project directory of + +# Parses arguments +model_name_or_path=meta-llama/Llama-2-13b-hf +dataset_path=data/alpaca/train_conversation +conversation_template=llama2 +output_dir=output_models/finetune +deepspeed_args="--master_port=11000" + +# Safety related arguments +trust_remote_code=0 + +while [[ $# -ge 1 ]]; do + key="$1" + case ${key} in + -m|--model_name_or_path) + model_name_or_path="$2" + shift + ;; + -d|--dataset_path) + dataset_path="$2" + shift + ;; + --conversation_template) + conversation_template="$2" + shift + ;; + -o|--output_model_path) + output_dir="$2" + shift + ;; + --deepspeed_args) + deepspeed_args="$2" + shift + ;; + --trust_remote_code) + trust_remote_code="$2" + shift + ;; + *) + echo "error: unknown option \"${key}\"" 1>&2 + exit 1 + esac + shift +done + +# Finetune +exp_id=finetune_with_lora +project_dir=$(cd "$(dirname $0)"/..; pwd) +log_dir=${project_dir}/log/${exp_id} +mkdir -p ${output_dir} ${log_dir} + +deepspeed ${deepspeed_args} \ + examples/finetune.py \ + --model_name_or_path ${model_name_or_path} \ + --trust_remote_code ${trust_remote_code} \ + --dataset_path ${dataset_path} \ + --conversation_template ${conversation_template} \ + --output_dir ${output_dir} --overwrite_output_dir \ + --num_train_epochs 0.01 \ + --learning_rate 1e-4 \ + --block_size 512 \ + --per_device_train_batch_size 1 \ + 
--use_qlora 1 \ + --save_aggregated_lora 0 \ + --deepspeed configs/archive/ds_config_zero2.json \ + --fp16 \ + --run_name ${exp_id} \ + --validation_split_percentage 0 \ + --logging_steps 20 \ + --do_train \ + --ddp_timeout 72000 \ + --save_steps 5000 \ + --dataloader_num_workers 1 \ + | tee ${log_dir}/train.log \ + 2> ${log_dir}/train.err \ No newline at end of file diff --git a/scripts/run_inference.sh b/scripts/archive/run_inference.sh similarity index 70% rename from scripts/run_inference.sh rename to scripts/archive/run_inference.sh index 1a3241c1f..3e89ac3e4 100755 --- a/scripts/run_inference.sh +++ b/scripts/archive/run_inference.sh @@ -11,11 +11,9 @@ if [ $# -ge 2 ]; then lora_args="--lora_model_path $2" fi -accelerate launch --config_file configs/accelerator_multigpu_config.yaml \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/inference.py \ - --deepspeed configs/ds_config_chatbot.json \ --model_name_or_path ${model} \ - --use_accelerator True \ --max_new_tokens 256 \ --temperature 1.0 \ ${lora_args} diff --git a/scripts/run_inference_multimodal_model.sh b/scripts/archive/run_inference_multimodal_model.sh similarity index 84% rename from scripts/run_inference_multimodal_model.sh rename to scripts/archive/run_inference_multimodal_model.sh index ba150fa03..df54d192b 100755 --- a/scripts/run_inference_multimodal_model.sh +++ b/scripts/archive/run_inference_multimodal_model.sh @@ -11,7 +11,7 @@ fi CUDA_VISIBLE_DEVICES=0 \ deepspeed examples/inference.py \ - --deepspeed configs/ds_config_multimodal.json \ + --deepspeed configs/archive/ds_config_multimodal.json \ --model_name_or_path ${model} \ --arch_type vision_encoder_decoder \ ${lora_args} diff --git a/scripts/run_iterative_dpo.sh b/scripts/archive/run_iterative_dpo.sh similarity index 100% rename from scripts/run_iterative_dpo.sh rename to scripts/archive/run_iterative_dpo.sh diff --git a/scripts/run_multistage_finetune.sh b/scripts/archive/run_multistage_finetune.sh similarity index 95% rename from scripts/run_multistage_finetune.sh rename to scripts/archive/run_multistage_finetune.sh index 701540e8f..096c9e1e1 100755 --- a/scripts/run_multistage_finetune.sh +++ b/scripts/archive/run_multistage_finetune.sh @@ -31,7 +31,7 @@ deepspeed ${deepspeed_args} \ --use_lora 1 \ --lora_r 8 \ --save_aggregated_lora 1 \ - --deepspeed configs/ds_config_zero2.json \ + --deepspeed configs/archive/ds_config_zero2.json \ --bf16 \ --run_name finetune_with_lora \ --validation_split_percentage 0 \ diff --git a/scripts/run_raft_align.sh b/scripts/archive/run_raft_align.sh similarity index 96% rename from scripts/run_raft_align.sh rename to scripts/archive/run_raft_align.sh index 9cb5a9717..1215d33cf 100755 --- a/scripts/run_raft_align.sh +++ b/scripts/archive/run_raft_align.sh @@ -25,7 +25,7 @@ deepspeed ${deepspeed_args} \ --learning_rate 2e-5 \ --lr_scheduler_type "constant" \ --bf16 \ - --deepspeed configs/ds_config_zero2.json \ + --deepspeed configs/archive/ds_config_zero2.json \ --dataset_path ${project_dir}/data/hh_rlhf/rlhf_prompt \ --output_reward_path ${project_dir}/tmp/raft_aligner/reward.txt \ --output_dir ${output_dir} --overwrite_output_dir \ diff --git a/scripts/run_reward_modeling.sh b/scripts/archive/run_reward_modeling.sh similarity index 92% rename from scripts/run_reward_modeling.sh rename to scripts/archive/run_reward_modeling.sh index 311384576..9b8c2b9d6 100644 --- a/scripts/run_reward_modeling.sh +++ b/scripts/archive/run_reward_modeling.sh @@ -5,7 +5,6 @@ model_name_or_path=google/gemma-2b-it 
train_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train eval_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train output_dir=output_models/reward_modeling -deepspeed_args="--master_port=11345 --include localhost:6" conversation_template=gemma # Safety related arguments @@ -34,10 +33,6 @@ while [[ $# -ge 1 ]]; do conversation_template="$2" shift ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; --trust_remote_code) trust_remote_code="$2" shift @@ -55,9 +50,8 @@ project_dir=$(cd "$(dirname $0)"/..; pwd) log_dir=${project_dir}/log/${exp_id} mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/reward_modeling.py \ - --deepspeed configs/ds_config_zero3.json \ --model_name_or_path ${model_name_or_path} \ --arch_type "text_regression" \ --do_train True \ @@ -78,6 +72,7 @@ deepspeed ${deepspeed_args} \ --gradient_checkpointing True \ --remove_unused_columns False \ --bf16 True \ + --torch_dtype bfloat16 \ --logging_strategy "steps" \ --logging_steps 10 \ --optim "paged_adamw_32bit" \ diff --git a/scripts/run_reward_modeling_with_lisa.sh b/scripts/archive/run_reward_modeling_with_lisa.sh similarity index 92% rename from scripts/run_reward_modeling_with_lisa.sh rename to scripts/archive/run_reward_modeling_with_lisa.sh index 64716259a..f3ad3ea2f 100644 --- a/scripts/run_reward_modeling_with_lisa.sh +++ b/scripts/archive/run_reward_modeling_with_lisa.sh @@ -5,7 +5,6 @@ model_name_or_path=google/gemma-2b-it train_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train eval_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train output_dir=output_models/reward_modeling_lisa -deepspeed_args="--master_port=11345 --include localhost:6" conversation_template=gemma lisa_activated_layers=1 lisa_interval_steps=20 @@ -44,10 +43,6 @@ while [[ $# -ge 1 ]]; do conversation_template="$2" shift ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; --trust_remote_code) trust_remote_code="$2" shift @@ -60,14 +55,13 @@ while [[ $# -ge 1 ]]; do done # Finetune -exp_id=reward_modeling +exp_id=reward_modeling_lisa project_dir=$(cd "$(dirname $0)"/..; pwd) log_dir=${project_dir}/log/${exp_id} mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/reward_modeling.py \ - --deepspeed configs/ds_config_zero3.json \ --model_name_or_path ${model_name_or_path} \ --arch_type "text_regression" \ --do_train True \ @@ -88,6 +82,7 @@ deepspeed ${deepspeed_args} \ --gradient_checkpointing True \ --remove_unused_columns False \ --bf16 True \ + --torch_dtype bfloat16 \ --logging_strategy "steps" \ --logging_steps 10 \ --optim "paged_adamw_32bit" \ diff --git a/scripts/run_reward_modeling_with_lora.sh b/scripts/archive/run_reward_modeling_with_lora.sh similarity index 91% rename from scripts/run_reward_modeling_with_lora.sh rename to scripts/archive/run_reward_modeling_with_lora.sh index 1afbd9784..f0fe44622 100644 --- a/scripts/run_reward_modeling_with_lora.sh +++ b/scripts/archive/run_reward_modeling_with_lora.sh @@ -5,7 +5,6 @@ model_name_or_path=google/gemma-2b-it train_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train eval_dataset_path=data/ultrafeedback-binarized-preferences-cleaned/train output_dir=output_models/reward_modeling_lora -deepspeed_args="--master_port=11345 --include localhost:6" conversation_template=gemma # Safety related arguments @@ -34,10 +33,6 @@ 
while [[ $# -ge 1 ]]; do conversation_template="$2" shift ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; --trust_remote_code) trust_remote_code="$2" shift @@ -50,14 +45,13 @@ while [[ $# -ge 1 ]]; do done # Finetune -exp_id=reward_modeling +exp_id=reward_modeling_lora project_dir=$(cd "$(dirname $0)"/..; pwd) log_dir=${project_dir}/log/${exp_id} mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/reward_modeling.py \ - --deepspeed configs/ds_config_zero3.json \ --model_name_or_path ${model_name_or_path} \ --arch_type "text_regression" \ --do_train True \ @@ -78,6 +72,7 @@ deepspeed ${deepspeed_args} \ --gradient_checkpointing True \ --remove_unused_columns False \ --bf16 True \ + --torch_dtype bfloat16 \ --logging_strategy "steps" \ --logging_steps 10 \ --optim "paged_adamw_32bit" \ diff --git a/scripts/run_rm_inference.sh b/scripts/archive/run_rm_inference.sh similarity index 98% rename from scripts/run_rm_inference.sh rename to scripts/archive/run_rm_inference.sh index 701daf120..32d68dff1 100644 --- a/scripts/run_rm_inference.sh +++ b/scripts/archive/run_rm_inference.sh @@ -61,7 +61,6 @@ accelerate launch --config_file configs/accelerator_multigpu_config.yaml \ --trust_remote_code ${trust_remote_code} \ --model_name_or_path ${model_name_or_path} \ --arch_type text_regression \ - --use_accelerator True \ --block_size 4096 \ --inference_batch_size 16 \ --dataset_path ${dataset_path} \ diff --git a/scripts/run_tool.sh b/scripts/archive/run_tool.sh similarity index 100% rename from scripts/run_tool.sh rename to scripts/archive/run_tool.sh diff --git a/scripts/run_vllm_inference.sh b/scripts/archive/run_vllm_inference.sh similarity index 100% rename from scripts/run_vllm_inference.sh rename to scripts/archive/run_vllm_inference.sh diff --git a/scripts/data_preprocess/add_end_mark.py b/scripts/data_preprocess/add_end_mark.py deleted file mode 100644 index 900bf1646..000000000 --- a/scripts/data_preprocess/add_end_mark.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Adds prompt structure to a text2text dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help=textwrap.dedent("input dataset path, reads from stdin by default") - ) - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - parser.add_argument( - "--end_mark", type=str, - default="###", - help=textwrap.dedent("end mark that append to the end of output") - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - output_field_map = { - "text_only": "text", - "text2text": "output", - } - data_dict_type = data_dict["type"] - if not data_dict_type in output_field_map: - raise NotImplementedError( - "only support text_only or text2text dataset" - ) - - output_field = output_field_map[data_dict_type] - - num_instances = len(data_dict["instances"]) - for i in range(num_instances): - data_dict["instances"][i][output_field] += args.end_mark - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/add_prompt.py b/scripts/data_preprocess/add_prompt.py deleted file mode 100644 index ee477e105..000000000 --- a/scripts/data_preprocess/add_prompt.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Adds prompt structure to a text2text dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help=textwrap.dedent("input dataset path, reads from stdin by default") - ) - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - parser.add_argument( - "--prompt_structure", type=str, - default="{input}", - help=textwrap.dedent("prompt structure to augment input") - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - if data_dict["type"] != "text2text": - raise NotImplementedError( - "only support text2text prompt augmentation" - ) - - data_dict["instances"] = [ - { - "input": args.prompt_structure.format(input=instance["input"]), - "output": instance["output"], - } - for instance in data_dict["instances"] - ] - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/concat.py b/scripts/data_preprocess/concat.py deleted file mode 100644 index 7cafd7c9c..000000000 --- a/scripts/data_preprocess/concat.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Merges an extra dataset into current dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - parser.add_argument( - "--merge_from_path", type=str, - nargs="+", - help=textwrap.dedent( - "dataset path of the extra dataset that will be merged" - " into input dataset" - ) - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - - if args.merge_from_path is not None: - for i in range(0, len(args.merge_from_path)): - with open(args.merge_from_path[i], "r") as fin: - extra_data_dict = json.load(fin) - if i == 0: - data_dict = extra_data_dict - else: - if data_dict["type"] != extra_data_dict["type"]: - raise ValueError( - 'two dataset have different types:' - f' input dataset: "{data_dict["type"]}";' - f' merge from dataset: "{extra_data_dict["type"]}"' - ) - data_dict["instances"].extend(extra_data_dict["instances"]) - else: - raise ValueError("No merge files specified") - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/concat_shuffle_split.py b/scripts/data_preprocess/concat_shuffle_split.py deleted file mode 100644 index d83dfc98e..000000000 --- a/scripts/data_preprocess/concat_shuffle_split.py +++ /dev/null @@ -1,149 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -This script is designed for handling large datasets. -It merges multiple datasets located in the same directory, shuffles them, and splits them into training, evaluation, and testing sets. -The training set is further divided into 10 folds. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys -import os -import random -import gc - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - parser.add_argument( - "--merge_from_path", type=str, - nargs="+", - help=textwrap.dedent( - "dataset path of the extra dataset that will be merged" - " into input dataset" - ) - ) - parser.add_argument( - "--seed", type=int, default=42, - help=textwrap.dedent("pseudorandom seed") - ) - parser.add_argument( - "--eval_size", type=int, default=200, - help=textwrap.dedent("size of eval dataset") - ) - parser.add_argument( - "--test_size", type=int, default=1000, - help=textwrap.dedent("size of test dataset") - ) - parser.add_argument( - "--k", type=int, default=10, - help=textwrap.dedent("the train dataset will be divide into k folds") - ) - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - - # concat - if args.merge_from_path is not None: - for i in range(0, len(args.merge_from_path)): - with open(args.merge_from_path[i], "r") as fin: - extra_data_dict = json.load(fin) - if i == 0: - data_dict = extra_data_dict - else: - if data_dict["type"] != extra_data_dict["type"]: - raise ValueError( - 'two dataset have different types:' - f' input dataset: "{data_dict["type"]}";' - f' merge from dataset: "{extra_data_dict["type"]}"' - ) - data_dict["instances"].extend(extra_data_dict["instances"]) - else: - raise ValueError("No merge files specified") - del extra_data_dict - gc.collect() - print('finish concat') - - # shuffle - random.seed(args.seed) - random.shuffle(data_dict["instances"]) - print('finish shuffle') - # split to train, eval, test - train_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][args.eval_size:-args.test_size]} - eval_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][:args.eval_size]} - test_data_dict = {"type":data_dict["type"],"instances":data_dict["instances"][-args.test_size:]} - del data_dict - gc.collect() - - # divide train in 10 folds - num_instances = len(train_data_dict["instances"]) - split_size = num_instances // args.k - split_data = [] - for i in range(args.k): - if i < args.k-1: - split = train_data_dict["instances"][i*split_size : (i+1)*split_size] - else: - # Last split may have remaining instances - split = train_data_dict["instances"][i*split_size:] - split_data.append({'type': train_data_dict["type"], 'instances': split}) - - del train_data_dict - gc.collect() - - print('finish split') - # save dataset under output_path - - if args.output_path is None: - args.output_path = sys.stdout - - train_save_path=os.path.join(args.output_path,"train_{k}_folds".format(k=args.k)) - if not os.path.exists(train_save_path): - os.makedirs(train_save_path) - for i in range(args.k): - with open(train_save_path+"/train_"+str(i)+".json", 'w') as f: - json.dump(split_data[i], f, indent=4, ensure_ascii=False) - - eval_save_path=os.path.join(args.output_path,"eval") - if not os.path.exists(eval_save_path): - os.makedirs(eval_save_path) - with open(eval_save_path+'/eval.json','w') as f: - json.dump(eval_data_dict,f,indent=4,ensure_ascii=False) - - test_save_path=os.path.join(args.output_path,"test") - if not os.path.exists(test_save_path): - 
os.makedirs(test_save_path) - with open(test_save_path+'/test.json','w') as f: - json.dump(test_data_dict,f,indent=4,ensure_ascii=False) - - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/count.py b/scripts/data_preprocess/count.py deleted file mode 100644 index ab3df108d..000000000 --- a/scripts/data_preprocess/count.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Counts number of instances in a dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import random -import sys -import textwrap - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. - For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help="input dataset path, reads from stdin by default" - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - num_instances = len(data_dict["instances"]) - print(num_instances) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/merge.py b/scripts/data_preprocess/merge.py deleted file mode 100644 index 2dcb99512..000000000 --- a/scripts/data_preprocess/merge.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Merges an extra dataset into current dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - parser.add_argument( - "--dataset_path", type=str, - default=None, - help=textwrap.dedent("input dataset path, reads from stdin by default") - ) - # Training parameters - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - parser.add_argument( - "--merge_from_path", type=str, - nargs="+", - help=textwrap.dedent( - "dataset path of the extra dataset that will be merged" - " into input dataset" - ) - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - if args.merge_from_path is not None: - for i in range(0, len(args.merge_from_path)): - with open(args.merge_from_path[i], "r") as fin: - extra_data_dict = json.load(fin) - - if data_dict["type"] != extra_data_dict["type"]: - raise ValueError( - 'two dataset have different types:' - f' input dataset: "{data_dict["type"]}";' - f' merge from dataset: "{extra_data_dict["type"]}"' - ) - data_dict["instances"].extend(extra_data_dict["instances"]) - - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/raw2textonly.py b/scripts/data_preprocess/raw2textonly.py deleted file mode 100644 index eb64c512d..000000000 --- a/scripts/data_preprocess/raw2textonly.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Converts a raw text file, separated by lines, into a "text-only" formatted json. -""" -from __future__ import absolute_import - -import argparse -import json -import textwrap -import sys - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. - For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help=textwrap.dedent("input dataset path, reads from stdin by default") - ) - parser.add_argument( - "--output_path", type=str, - default=None, - help=textwrap.dedent("output dataset path, writes to stdout by default") - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def raw2textonly(fin): - """ - Converts raw text to text-only format. - - Args: - fin: the input file description of the raw text file. - Returns: - a dict with "text-only" format. 
- """ - data_dict = { - "type": "text_only", - "instances": [ { "text": line.strip() } for line in fin ], - } - return data_dict - - -def main(): - args = parse_argument(sys.argv) - - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = raw2textonly(fin) - else: - data_dict = raw2textonly(sys.stdin) - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/run_data_preprocess.sh b/scripts/data_preprocess/run_data_preprocess.sh deleted file mode 100644 index 8b515de9a..000000000 --- a/scripts/data_preprocess/run_data_preprocess.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# Run this shell script under project directory - -# For sample.py -python scripts/data_preprocess/sample.py \ - --dataset_path ./data/example_dataset/train/train_50.json \ - --output_path ./data/example_dataset/train/train_50_sample.json \ - --ratio 0.5 - -# For shuffle.py -python scripts/data_preprocess/shuffle.py \ - --dataset_path ./data/example_dataset/train/train_50_sample.json \ - --output_path ./data/example_dataset/train/train_50_sample_shuffle.json - -# For merge.py : you can specify multiple files to merge -python scripts/data_preprocess/merge.py \ - --dataset_path ./data/example_dataset/train/train_50.json \ - --merge_from_path ./data/example_dataset/train/train_50_sample_shuffle.json \ - ./data/example_dataset/train/train_50_sample.json \ - --output_path ./data/example_dataset/train/train_merge.json \ - -# For concat.py: if you simply want to merge multiple files or a directory, use following. -# You can also specify multiple files after --merge_from_path -python scripts/data_preprocess/concat.py \ - --merge_from_path ./data/example_dataset/train/*.json \ - --output_path ./data/example_dataset/train/train_merge.json \ - -# For concat_shuffle_split.py: if you simply want to merge multiple files or a directory, use following. -python scripts/data_preprocess/concat_shuffle_split.py \ - --merge_from_path ./data/example_dataset/train/*.json \ - --output_path ./data/processed_dataset/ \ \ No newline at end of file diff --git a/scripts/data_preprocess/sample.py b/scripts/data_preprocess/sample.py deleted file mode 100644 index 0a503d872..000000000 --- a/scripts/data_preprocess/sample.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Samples a certain ratio of instances from a dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import random -import sys -import textwrap - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help="input dataset path, reads from stdin by default" - ) - parser.add_argument( - "--output_path", type=str, - default=None, - help="output dataset path, writes to stdout by default" - ) - parser.add_argument( - "--ratio", type=float, required=True, - help="sample ratio, will be floored if number of samples is not a int" - ) - parser.add_argument( - "--seed", type=int, default=42, - help="pseudorandom seed" - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - random.seed(args.seed) - num_instances = len(data_dict["instances"]) - num_sample = int(num_instances * args.ratio) - - data_dict["instances"] = random.sample(data_dict["instances"], num_sample) - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/data_preprocess/shuffle.py b/scripts/data_preprocess/shuffle.py deleted file mode 100644 index 7572efacd..000000000 --- a/scripts/data_preprocess/shuffle.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2023 Statistics and Machine Learning Research Group at HKUST. All rights reserved. -""" -Samples a certain ratio of instances from a dataset. -""" -from __future__ import absolute_import - -import argparse -import json -import random -import sys -import textwrap - -def parse_argument(sys_argv): - """Parses arguments from command line. - Args: - sys_argv: the list of arguments (strings) from command line. - Returns: - A struct whose member corresponds to the required (optional) variable. 
- For example, - ``` - args = parse_argument(['main.py' '--input', 'a.txt', '--num', '10']) - args.input # 'a.txt' - args.num # 10 - ``` - """ - parser = argparse.ArgumentParser( - formatter_class=argparse.RawTextHelpFormatter) - - # Training parameters - parser.add_argument( - "--dataset_path", type=str, - default=None, - help="input dataset path, reads from stdin by default" - ) - parser.add_argument( - "--output_path", type=str, - default=None, - help="output dataset path, writes to stdout by default" - ) - parser.add_argument( - "--seed", type=int, default=42, - help="pseudorandom seed" - ) - - # Parses from commandline - args = parser.parse_args(sys_argv[1:]) - - return args - - -def main(): - args = parse_argument(sys.argv) - if args.dataset_path is not None: - with open(args.dataset_path, "r") as fin: - data_dict = json.load(fin) - else: - data_dict = json.load(sys.stdin) - - random.seed(args.seed) - random.shuffle(data_dict["instances"]) - - if args.output_path is not None: - with open(args.output_path, "w") as fout: - json.dump(data_dict, fout, indent=4, ensure_ascii=False) - else: - json.dump(data_dict, sys.stdout, indent=4, ensure_ascii=False) - - -if __name__ == "__main__": - main() diff --git a/scripts/multimodal/run_finetune_multi_modal_stage1.sh b/scripts/multimodal/run_finetune_multi_modal_stage1.sh index 743301b21..8e0180d04 100755 --- a/scripts/multimodal/run_finetune_multi_modal_stage1.sh +++ b/scripts/multimodal/run_finetune_multi_modal_stage1.sh @@ -51,7 +51,7 @@ mkdir -p ${output_dir} ${log_dir} deepspeed ${deepspeed_args} \ examples/finetune_multi_modal.py \ - --deepspeed configs/ds_config_multimodal.json \ + --deepspeed configs/archive/ds_config_multimodal.json \ --arch_type vision_encoder_decoder \ --llava_loading True \ --model_name_or_path ${model_name_or_path} \ diff --git a/scripts/multimodal/run_finetune_multi_modal_stage2.sh b/scripts/multimodal/run_finetune_multi_modal_stage2.sh index 3884c3c99..f0ea10f92 100755 --- a/scripts/multimodal/run_finetune_multi_modal_stage2.sh +++ b/scripts/multimodal/run_finetune_multi_modal_stage2.sh @@ -66,7 +66,7 @@ mkdir -p ${output_dir} ${log_dir} # default in llava is 16 deepspeed ${deepspeed_args} \ examples/finetune_multi_modal.py \ - --deepspeed configs/ds_config_zero2.json \ + --deepspeed configs/archive/ds_config_zero2.json \ --arch_type vision_encoder_decoder \ --llava_loading True \ --model_name_or_path ${model_name_or_path} \ diff --git a/scripts/multimodal/run_vis_chatbot_blip2.sh b/scripts/multimodal/run_vis_chatbot_blip2.sh index 0c0b803f3..492409a88 100755 --- a/scripts/multimodal/run_vis_chatbot_blip2.sh +++ b/scripts/multimodal/run_vis_chatbot_blip2.sh @@ -1,6 +1,6 @@ model=Salesforce/blip2-opt-2.7b deepspeed examples/vis_chatbot.py --model_name_or_path ${model} \ - --deepspeed configs/ds_config_vis_chatbot.json \ + --deepspeed configs/archive/ds_config_vis_chatbot.json \ --arch_type vision_encoder_decoder \ --task vqa \ ${@:1} diff --git a/scripts/multimodal/run_vis_chatbot_gradio_minigpt4.sh b/scripts/multimodal/run_vis_chatbot_gradio_minigpt4.sh index 0e0556e81..280461f1e 100755 --- a/scripts/multimodal/run_vis_chatbot_gradio_minigpt4.sh +++ b/scripts/multimodal/run_vis_chatbot_gradio_minigpt4.sh @@ -14,7 +14,7 @@ model=Salesforce/blip2-flan-t5-xxl # # deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \ # --model_name_or_path ${model} \ -# --deepspeed configs/ds_config_multimodal.json \ +# --deepspeed configs/archive/ds_config_multimodal.json \ # --arch_type vision_encoder_decoder \ # --task vqa 
\ # --custom_model \ @@ -37,7 +37,7 @@ fi deepspeed --master_port=11005 examples/vis_chatbot_gradio.py \ --model_name_or_path ${model} \ - --deepspeed configs/ds_config_vis_chatbot.json \ + --deepspeed configs/archive/ds_config_vis_chatbot.json \ --arch_type vision_encoder_decoder \ --task vqa \ --custom_model \ diff --git a/scripts/multimodal/run_vis_chatbot_llava.sh b/scripts/multimodal/run_vis_chatbot_llava.sh index 028cbaecb..79c358d64 100755 --- a/scripts/multimodal/run_vis_chatbot_llava.sh +++ b/scripts/multimodal/run_vis_chatbot_llava.sh @@ -18,7 +18,7 @@ fi deepspeed ${deepspeed_args} \ examples/vis_chatbot.py \ - --deepspeed configs/ds_config_vis_chatbot.json \ + --deepspeed configs/archive/ds_config_vis_chatbot.json \ --arch_type vision_encoder_decoder \ --task vqa \ --custom_model True \ diff --git a/scripts/multimodal/run_vis_chatbot_minigpt4.sh b/scripts/multimodal/run_vis_chatbot_minigpt4.sh index e5a445da5..2d3e6bab3 100755 --- a/scripts/multimodal/run_vis_chatbot_minigpt4.sh +++ b/scripts/multimodal/run_vis_chatbot_minigpt4.sh @@ -12,7 +12,7 @@ if [ ! -f output_models/pretrained_minigpt4_7b_converted.pth ]; then --save_path output_models/pretrained_minigpt4_7b_converted.pth fi -deepspeed ${deepspeed_args} examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/ds_config_vis_chatbot.json --arch_type vision_encoder_decoder --task vqa --custom_model \ +deepspeed ${deepspeed_args} examples/vis_chatbot.py --model_name_or_path ${model} --deepspeed configs/archive/ds_config_vis_chatbot.json --arch_type vision_encoder_decoder --task vqa --custom_model \ --chatbot_type mini_gpt \ --prompt_structure "{input_text}###Assistant:" \ --pretrained_language_projection_path output_models/pretrained_minigpt4_7b_converted.pth \ diff --git a/scripts/run_finetune.sh b/scripts/run_finetune.sh index d78458e20..b317e2188 100755 --- a/scripts/run_finetune.sh +++ b/scripts/run_finetune.sh @@ -1,78 +1,40 @@ #!/bin/bash -# Please run this script under ${project_id} in project directory of -# https://github.com/shizhediao/llm-ft -# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 - -# Parses arguments -model_name_or_path=gpt2 +model_name_or_path=meta-llama/Llama-3.2-3B-Instruct dataset_path=data/alpaca/train_conversation +conversation_template=llama3 output_dir=output_models/finetune -deepspeed_args="--master_port=11000" -conversation_template=llama2 - -# Safety related arguments -trust_remote_code=0 - -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -m|--model_name_or_path) - model_name_or_path="$2" - shift - ;; - -d|--dataset_path) - dataset_path="$2" - shift - ;; - -o|--output_model_path) - output_dir="$2" - shift - ;; - --conversation_template) - conversation_template="$2" - shift - ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; - --trust_remote_code) - trust_remote_code="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done # Finetune exp_id=finetune -project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${exp_id} +log_dir=${output_dir}/log/ mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/finetune.py \ --model_name_or_path ${model_name_or_path} \ - --trust_remote_code ${trust_remote_code} \ + --trust_remote_code 0 \ --dataset_path ${dataset_path} \ --output_dir ${output_dir} --overwrite_output_dir \ --conversation_template ${conversation_template} \ - --num_train_epochs 0.01 \ - --learning_rate 2e-5 \ 
--disable_group_texts 1 \ - --block_size 256 \ + --num_train_epochs 1 \ + --block_size 512 \ --per_device_train_batch_size 1 \ - --deepspeed configs/ds_config_zero3.json \ - --fp16 \ - --run_name finetune \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-5 \ + --lr_scheduler_type cosine \ + --bf16 \ + --torch_dtype bfloat16 \ --validation_split_percentage 0 \ --logging_steps 20 \ --do_train \ --ddp_timeout 72000 \ --save_steps 5000 \ - --dataloader_num_workers 1 \ + --use_flash_attention 0 \ + --gradient_checkpointing 0 \ + --dataloader_num_workers 8 \ + --report_to wandb \ + --run_name ${exp_id} \ + --seed 42 \ > >(tee ${log_dir}/train.log) \ 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/run_finetune_with_custom_optim.sh b/scripts/run_finetune_with_custom_optim.sh index f52d95053..1e2272c20 100644 --- a/scripts/run_finetune_with_custom_optim.sh +++ b/scripts/run_finetune_with_custom_optim.sh @@ -1,20 +1,9 @@ #!/bin/bash -# Please run this script under ${project_id} in project directory of -# https://github.com/shizhediao/llm-ft -# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 -# Parses arguments -model_name_or_path=gpt2 +model_name_or_path=meta-llama/Llama-3.2-3B-Instruct dataset_path=data/alpaca/train_conversation +conversation_template=llama3 +output_dir=output_models/finetune_custom_optim -# Other optional arguments that can improve memory saving -gradient_checkpointing=True -use_flash_attention=0 -gradient_accumulation_steps=1 -batch_size=1 -block_size=256 -per_device_train_batch_size=1 -conversation_template=llama2 -optim=dummy # Select an optimizer from the following options: # - 'adamw_torch' # - 'adafactor' @@ -23,140 +12,17 @@ optim=dummy # - 'lion_32bit' # - 'rmsprop' # Additional optimizers are shown below -learning_rate=1e-5 -lr_schedule=cosine +optim=dummy beta1=0.9 beta2=0.999 beta3=0.99 weight_decay=0 momentum=0 -num_epoch=0.01 -use_deepspeed=1 -seed=42 - -# Safety related arguments -trust_remote_code=0 - -# Enable model parallelism for multiple gpus, modify this if you prefer -# customized deepspeed zero-redundancy optimization settings -num_gpu=$(python -c "import torch; print(torch.cuda.device_count())") -ds_config_file=configs/ds_config_zero0_no_offload.json -if [[ ${num_gpu} -ge 2 ]]; then - ds_config_file=configs/ds_config_zero2_no_offload.json -fi - -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -m|--model_name_or_path) - model_name_or_path="$2" - shift - ;; - -d|--dataset_path) - dataset_path="$2" - shift - ;; - -o|--output_model_path) - output_dir="$2" - shift - ;; - --lisa_activated_layers) - lisa_activated_layers="$2" - shift - ;; - --lisa_interval_steps) - lisa_interval_steps="$2" - shift - ;; - --gradient_checkpointing) - gradient_checkpointing="$2" - shift - ;; - --deepspeed) - ds_config_file="$2" - shift - ;; - --use_flash_attention) - use_flash_attention="$2" - shift - ;; - --gradient_accumulation_steps) - gradient_accumulation_steps="$2" - shift - ;; - --block_size) - block_size="$2" - shift - ;; - --conversation_template) - conversation_template="$2" - shift - ;; - --per_device_train_batch_size|--batch_size) - per_device_train_batch_size="$2" - batch_size="$2" - shift - ;; - --trust_remote_code) - trust_remote_code="$2" - shift - ;; - --run_name) - run_name="$2" - shift - ;; - --optim) - optim="$2" - shift - ;; - --lr) - learning_rate="$2" - shift - ;; - --beta1) - beta1="$2" - shift - ;; - --beta2) - beta2="$2" - shift - ;; - --beta3) - beta3="$2" - shift - ;; - --weight_decay) - weight_decay="$2" - shift - ;; - --momentum) 
- momentum="$2" - shift - ;; - -n|--num_epoch) - num_epoch="$2" - shift - ;; - --lr_schedule) - lr_schedule="$2" - shift - ;; - --use_deepspeed) - use_deepspeed="$2" - shift - ;; - --seed) - seed="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done -deepspeed_args="--master_port=1103 --hostfile configs/hostfile" +# Finetune +exp_id=finetune_custom_optim +log_dir=${output_dir}/log/ +mkdir -p ${output_dir} ${log_dir} optim_suffix_args="" if [ "${optim}" == "dummy" ]; then @@ -275,62 +141,33 @@ else optim_suffix_args+=" --adam_beta2 ${beta2}" fi -# Finetune -exp_id=alpaca_${optim}_lr-${learning_rate}_beta1-${beta1}_beta2-${beta2}_lr-sched-${lr_schedule}_model-$(basename ${model_name_or_path})_batch-size-${batch_size}x${gradient_accumulation_steps}_seed-${seed} -echo "$(date): ${exp_id}..." - -tmp_dir=tmp -mkdir -p ${tmp_dir} - -prefix=${exp_id} -if [ -f ${tmp_dir}/${prefix}.mark ]; then - exit 0 -fi - -trap "rm -f ${tmp_dir}/${prefix}.mark" SIGINT SIGTERM SIGKILL -touch ${tmp_dir}/${prefix}.mark - -project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${exp_id} -output_dir=output_models/${exp_id} -mkdir -p ${output_dir} ${log_dir} - -exe="deepspeed ${deepspeed_args}" -if [[ ${use_deepspeed} -eq 0 ]]; then - exe=python -fi -${exe} examples/finetune.py \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ + examples/finetune.py \ --model_name_or_path ${model_name_or_path} \ - --trust_remote_code ${trust_remote_code} \ + --trust_remote_code 0 \ --dataset_path ${dataset_path} \ --output_dir ${output_dir} --overwrite_output_dir \ --conversation_template ${conversation_template} \ - --num_train_epochs ${num_epoch} \ - --learning_rate ${learning_rate} \ - --lr_scheduler_type ${lr_schedule} \ + ${optim_suffix_args} \ --disable_group_texts 1 \ - --block_size ${block_size} \ - --per_device_train_batch_size ${per_device_train_batch_size} \ + --num_train_epochs 1 \ + --block_size 512 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-5 \ + --lr_scheduler_type cosine \ --bf16 \ - --deepspeed configs/ds_config_zero2_no_offload.json \ --torch_dtype bfloat16 \ - --run_name ${exp_id} \ --validation_split_percentage 0 \ - --logging_steps 1 \ + --logging_steps 20 \ --do_train \ --ddp_timeout 72000 \ --save_steps 5000 \ - --dataloader_num_workers 1 \ - --gradient_checkpointing ${gradient_checkpointing} \ - --use_flash_attention ${use_flash_attention} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --seed ${seed} \ - ${optim_suffix_args} \ - | tee ${log_dir}/train.log \ - 2> ${log_dir}/train.err - - -if [[ $? 
-ne 0 ]]; then - echo "$(date): failed" - rm -f ${tmp_dir}/${prefix}.mark -fi + --use_flash_attention 0 \ + --gradient_checkpointing 0 \ + --dataloader_num_workers 8 \ + --report_to wandb \ + --run_name ${exp_id} \ + --seed 42 \ + > >(tee ${log_dir}/train.log) \ + 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/run_finetune_with_lisa.sh b/scripts/run_finetune_with_lisa.sh index a7e2268dc..cf01a2c96 100755 --- a/scripts/run_finetune_with_lisa.sh +++ b/scripts/run_finetune_with_lisa.sh @@ -1,128 +1,47 @@ #!/bin/bash -# Please run this script under ${project_id} in project directory of -# https://github.com/shizhediao/llm-ft -# COMMIT: d5fecf30ba8011067b10cf51fede53a5ab6574e4 - -# Parses arguments -model_name_or_path=meta-llama/Llama-2-7b-hf +model_name_or_path=meta-llama/Llama-3.2-3B-Instruct dataset_path=data/alpaca/train_conversation +conversation_template=llama3 output_dir=output_models/finetune_lisa + +# LISA related arguments lisa_activated_layers=1 lisa_interval_steps=20 -# Other optional arguments that can improve memory saving -gradient_checkpointing=True -use_flash_attention=0 -gradient_accumulation_steps=1 -block_size=256 -per_device_train_batch_size=1 -conversation_template=llama2 - -# Safety related arguments -trust_remote_code=0 - -# Enable model parallelism for multiple gpus, modify this if you prefer -# customized deepspeed zero-redundancy optimization settings -num_gpu=$(python -c "import torch; print(torch.cuda.device_count())") -ds_config_file=configs/ds_config_zero0_no_offload.json -if [ ${num_gpu} -ge 2 ]; then - ds_config_file=configs/ds_config_zero2_no_offload.json -fi - -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -m|--model_name_or_path) - model_name_or_path="$2" - shift - ;; - -d|--dataset_path) - dataset_path="$2" - shift - ;; - -o|--output_model_path) - output_dir="$2" - shift - ;; - --lisa_activated_layers) - lisa_activated_layers="$2" - shift - ;; - --lisa_interval_steps) - lisa_interval_steps="$2" - shift - ;; - --gradient_checkpointing) - gradient_checkpointing="$2" - shift - ;; - --deepspeed) - ds_config_file="$2" - shift - ;; - --use_flash_attention) - use_flash_attention="$2" - shift - ;; - --gradient_accumulation_steps) - gradient_accumulation_steps="$2" - shift - ;; - --block_size) - block_size="$2" - shift - ;; - --conversation_template) - conversation_template="$2" - shift - ;; - --per_device_train_batch_size|--batch_size) - per_device_train_batch_size="$2" - shift - ;; - --trust_remote_code) - trust_remote_code="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done - # Finetune -exp_id=finetune -project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${exp_id} +exp_id=finetune_lisa +log_dir=${output_dir}/log/ mkdir -p ${output_dir} ${log_dir} -python examples/finetune.py \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ + examples/finetune.py \ --model_name_or_path ${model_name_or_path} \ - --trust_remote_code ${trust_remote_code} \ + --trust_remote_code 0 \ --dataset_path ${dataset_path} \ --output_dir ${output_dir} --overwrite_output_dir \ --conversation_template ${conversation_template} \ - --num_train_epochs 1 \ - --learning_rate 1e-5 \ + --use_lisa 1 \ + --lisa_activated_layers ${lisa_activated_layers} \ + --lisa_interval_steps ${lisa_interval_steps} \ --disable_group_texts 1 \ - --block_size ${block_size} \ - --per_device_train_batch_size ${per_device_train_batch_size} \ + --num_train_epochs 1 \ + --block_size 512 \ + --per_device_train_batch_size 
1 \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-5 \ + --lr_scheduler_type cosine \ --bf16 \ --torch_dtype bfloat16 \ - --run_name finetune \ - --optim paged_adamw_32bit \ --validation_split_percentage 0 \ --logging_steps 20 \ --do_train \ --ddp_timeout 72000 \ --save_steps 5000 \ - --dataloader_num_workers 1 \ - --gradient_checkpointing ${gradient_checkpointing} \ - --use_flash_attention ${use_flash_attention} \ - --gradient_accumulation_steps ${gradient_accumulation_steps} \ - --use_lisa 1 \ - --lisa_activated_layers ${lisa_activated_layers} \ - --lisa_interval_steps ${lisa_interval_steps} \ - | tee ${log_dir}/train.log \ - 2> ${log_dir}/train.err + --use_flash_attention 0 \ + --gradient_checkpointing 0 \ + --dataloader_num_workers 8 \ + --report_to wandb \ + --run_name ${exp_id} \ + --seed 42 \ + > >(tee ${log_dir}/train.log) \ + 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/run_finetune_with_lora.sh b/scripts/run_finetune_with_lora.sh index c921ff744..096592880 100755 --- a/scripts/run_finetune_with_lora.sh +++ b/scripts/run_finetune_with_lora.sh @@ -1,78 +1,49 @@ #!/bin/bash -# Please run this script under ${project_id} in project directory of - -# Parses arguments -model_name_or_path=gpt2 +model_name_or_path=meta-llama/Llama-3.2-3B-Instruct dataset_path=data/alpaca/train_conversation -conversation_template=llama2 -output_dir=output_models/finetune -deepspeed_args="--master_port=11000" - -# Safety related arguments -trust_remote_code=0 +conversation_template=llama3 +output_dir=output_models/finetune_lora -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -m|--model_name_or_path) - model_name_or_path="$2" - shift - ;; - -d|--dataset_path) - dataset_path="$2" - shift - ;; - --conversation_template) - conversation_template="$2" - shift - ;; - -o|--output_lora_path) - output_dir="$2" - shift - ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; - --trust_remote_code) - trust_remote_code="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done +# LoRA related arguments +lora_r=8 +lora_alpha=32 +lora_dropout=0.1 # Finetune exp_id=finetune_with_lora -project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${exp_id} +log_dir=${output_dir}/log/ mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/finetune.py \ --model_name_or_path ${model_name_or_path} \ - --trust_remote_code ${trust_remote_code} \ + --trust_remote_code 0 \ --dataset_path ${dataset_path} \ - --conversation_template ${conversation_template} \ --output_dir ${output_dir} --overwrite_output_dir \ - --num_train_epochs 0.01 \ - --learning_rate 1e-4 \ + --conversation_template ${conversation_template} \ + --use_lora 1 \ + --lora_r ${lora_r} \ + --lora_alpha ${lora_alpha} \ + --lora_dropout ${lora_dropout} \ + --disable_group_texts 1 \ + --num_train_epochs 1 \ --block_size 512 \ --per_device_train_batch_size 1 \ - --use_lora 1 \ - --lora_r 8 \ - --save_aggregated_lora 0\ - --deepspeed configs/ds_config_zero2.json \ - --fp16 \ - --run_name ${exp_id} \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-5 \ + --lr_scheduler_type cosine \ + --bf16 \ + --torch_dtype bfloat16 \ --validation_split_percentage 0 \ --logging_steps 20 \ --do_train \ --ddp_timeout 72000 \ --save_steps 5000 \ - --dataloader_num_workers 1 \ - | tee ${log_dir}/train.log \ - 2> ${log_dir}/train.err + --use_flash_attention 0 \ + --gradient_checkpointing 0 \ + --dataloader_num_workers 
8 \ + --report_to wandb \ + --run_name ${exp_id} \ + --seed 42 \ + > >(tee ${log_dir}/train.log) \ + 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/run_finetune_with_qlora.sh b/scripts/run_finetune_with_qlora.sh index d1cfdf5aa..84f684fb2 100644 --- a/scripts/run_finetune_with_qlora.sh +++ b/scripts/run_finetune_with_qlora.sh @@ -1,77 +1,51 @@ #!/bin/bash -# Please run this script under ${project_id} in project directory of - -# Parses arguments -model_name_or_path=meta-llama/Llama-2-13b-hf +model_name_or_path=meta-llama/Llama-3.2-3B-Instruct dataset_path=data/alpaca/train_conversation -conversation_template=llama2 -output_dir=output_models/finetune -deepspeed_args="--master_port=11000" - -# Safety related arguments -trust_remote_code=0 +conversation_template=llama3 +output_dir=output_models/finetune_qlora -while [[ $# -ge 1 ]]; do - key="$1" - case ${key} in - -m|--model_name_or_path) - model_name_or_path="$2" - shift - ;; - -d|--dataset_path) - dataset_path="$2" - shift - ;; - --conversation_template) - conversation_template="$2" - shift - ;; - -o|--output_model_path) - output_dir="$2" - shift - ;; - --deepspeed_args) - deepspeed_args="$2" - shift - ;; - --trust_remote_code) - trust_remote_code="$2" - shift - ;; - *) - echo "error: unknown option \"${key}\"" 1>&2 - exit 1 - esac - shift -done +# QLoRA related arguments +quant_bit=4 +lora_r=8 +lora_alpha=32 +lora_dropout=0.1 # Finetune -exp_id=finetune_with_lora -project_dir=$(cd "$(dirname $0)"/..; pwd) -log_dir=${project_dir}/log/${exp_id} +exp_id=finetune_with_qlora +log_dir=${output_dir}/log/ mkdir -p ${output_dir} ${log_dir} -deepspeed ${deepspeed_args} \ +accelerate launch --config_file configs/accelerate_fsdp_config.yaml \ examples/finetune.py \ --model_name_or_path ${model_name_or_path} \ - --trust_remote_code ${trust_remote_code} \ + --trust_remote_code 0 \ --dataset_path ${dataset_path} \ - --conversation_template ${conversation_template} \ --output_dir ${output_dir} --overwrite_output_dir \ - --num_train_epochs 0.01 \ - --learning_rate 1e-4 \ + --conversation_template ${conversation_template} \ + --use_qlora 1 \ + --quant_bit ${quant_bit} \ + --lora_r ${lora_r} \ + --lora_alpha ${lora_alpha} \ + --lora_dropout ${lora_dropout} \ + --disable_group_texts 1 \ + --num_train_epochs 1 \ --block_size 512 \ --per_device_train_batch_size 1 \ - --use_qlora 1 \ - --save_aggregated_lora 0 \ - --deepspeed configs/ds_config_zero2.json \ - --fp16 \ - --run_name ${exp_id} \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-5 \ + --lr_scheduler_type cosine \ + --bf16 \ + --torch_dtype bfloat16 \ --validation_split_percentage 0 \ --logging_steps 20 \ --do_train \ --ddp_timeout 72000 \ --save_steps 5000 \ - --dataloader_num_workers 1 \ - | tee ${log_dir}/train.log \ - 2> ${log_dir}/train.err \ No newline at end of file + --use_flash_attention 0 \ + --gradient_checkpointing 0 \ + --dataloader_num_workers 8 \ + --report_to wandb \ + --run_name ${exp_id} \ + --seed 42 \ + > >(tee ${log_dir}/train.log) \ + 2> >(tee ${log_dir}/train.err >&2) diff --git a/scripts/run_merge_lora.sh b/scripts/run_merge_lora.sh index d4a28ae91..5c6a3a388 100644 --- a/scripts/run_merge_lora.sh +++ b/scripts/run_merge_lora.sh @@ -1,14 +1,10 @@ #!/bin/bash - # Parses arguments model_name_or_path=gpt2 lora_model_path=output_models/lora output_model_path=output_models/merge_lora device=cpu -# if gpu -deepspeed_args="--master_port=11000" - while [[ $# -ge 1 ]]; do key="$1" case ${key} in @@ -28,10 +24,6 @@ while [[ $# -ge 1 ]]; do device="$2" shift ;; - 
--deepspeed_args) - deepspeed_args="$2" - shift - ;; *) echo "error: unknown option \"${key}\"" 1>&2 exit 1 @@ -46,7 +38,6 @@ if [ ${device} == "cpu" ]; then --lora_model_path ${lora_model_path} \ --output_model_path ${output_model_path} \ --device ${device} \ - --ds_config configs/ds_config_eval.json elif [ ${device} == "gpu" ]; then echo "Error: Merging LoRA weights using gpu not supported yet. Please use cpu." else diff --git a/scripts/speculative_decoding/README.md b/scripts/speculative_decoding/README.md deleted file mode 100644 index ccf112b1a..000000000 --- a/scripts/speculative_decoding/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Speculative Decoding -## Introduction -[Speculative Decoding (Ref: arXiv:2211.17192v2)](https://arxiv.org/abs/2211.17192) is now available for playing via: -```bash -python ./examples/speculative_inference.py \ - --model # your_model_name_or_path - --draft_model # your_draft_model_name_or_path - --temperature # your_temperature - --gamma # your_gamma - --max_new_tokens # your_max_new_tokens - --gpu # your_gpu_id -``` -For example, -```bash -python ./examples/speculative_inference.py \ - --model gpt2-xl - --draft_model gpt2 - --temperature 0.3 - --gamma 5 - --max_new_tokens 512 - --gpu 0 -``` -Another example, -```bash -python ./examples/speculative_inference.py \ - --model /home/eric/Documents/models/gpt2-xl - --draft_model /home/eric/Documents/models/gpt2 - --temperature 0 - --gamma 3 - --max_new_tokens 1024 - --gpu 7 -``` -## Parameter Instruction -`model`, `draft_model` -- Huggingface model name or locally cached model path. -- Currently only supports huggingface decoder only models. -- `model` refers to the target model (i.e., the large model you want to accelerate) in the paper. -- `draft_model` refers to the draft model in the paper. - -`temperature` -- Temperature for sampling. When temperature <= 1e-6, will use argmax sampling. - -`gamma` -- Number of tokens that the draft model will generate at each step. See the paper for more details. - -`max_new_tokens` -- Maximum number of tokens that the speculative inference will generate. -- TODO: currently the speculative decoding will always generate `max_new_tokens` tokens. We will add a `stop_token` in the future. - -`gpu` -- gpu id, currently speculative inference only support single gpu. - -## Experiments -We tested the speculative inference using the first 100 inputs from alpaca test dataset as prompts. When `model=gpt2-xl`, `draft_model=gpt2`, `temperature=0.`, `max_new_tokens=512`, we observed the following acceleration: - -|gamma|speedup (inference time)|speed up (num of forwards) -|--|--|--| -|1|1.75x|1.96x| -|2|2.29x|2.89x| -|3|2.71x|3.77x| -|4|3.06x|4.63x| -|5|3.35x|5.44x| -|6|3.65x|6.23x| -|7|3.82x|6.94x| -|8|3.96x|7.64x| -|9|4.05x|8.33x| -|10|4.14x|9.00x| - -Note that the speedup may be overestimated. When `temperature=0`, `gpt2-xl` and `gpt2` tend to generate duplicated tokens as the number of tokens generated increases, thus making the target model more likely to accept the draft model's output. 
\ No newline at end of file diff --git a/scripts/tools/print_model_architecture.py b/scripts/tools/print_model_architecture.py deleted file mode 100644 index 094f65032..000000000 --- a/scripts/tools/print_model_architecture.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -#coding=utf-8 -import argparse -import sys -from transformers import AutoModel - -def parse_argument(sys_argv): - parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument("--model_name_or_path", type=str, default='gpt2') - args = parser.parse_args(sys_argv[1:]) - return args - -def main(): - args = parse_argument(sys.argv) - model_name = args.model_name_or_path - model = AutoModel.from_pretrained(model_name) - - print(model.config) - print(model) - -if __name__ == "__main__": - main() diff --git a/scripts/vocab_extension/README.md b/scripts/vocab_extension/README.md deleted file mode 100644 index 5582d7d16..000000000 --- a/scripts/vocab_extension/README.md +++ /dev/null @@ -1,23 +0,0 @@ -# Vocab Extension -## Train & Merge Tokenizer -To automatically convert data, train a SentencePiece tokenizer, and merge the tokenizer, you can run the following script: -``` -bash scripts/vocab_extension/train_merge_tokenizer.sh -``` -Alternatively, you can run each of the three steps separately: - -## Convert JSON Data to TXT -To convert JSON data to TXT for sentencepiece tokenizer training, run: -``` -bash scripts/vocab_extension/convert_json_to_txt.sh -``` -## Train SentencePiece Tokenizer -To train a SentencePiece tokenizer, run: -``` -bash scripts/vocab_extension/train_tokenizer.sh -``` -## Merge New Tokenizer with the Origin One -To merge a new tokenizer with the original one, run: -``` -bash scripts/vocab_extension/merge_tokenizer.sh -``` \ No newline at end of file diff --git a/scripts/vocab_extension/convert_json_to_txt.sh b/scripts/vocab_extension/convert_json_to_txt.sh deleted file mode 100644 index 5d2a57e95..000000000 --- a/scripts/vocab_extension/convert_json_to_txt.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash - -cd data && bash download.sh wiki_zh_eval && cd - - -python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \ - --output_path ./data/wiki_zh_eval/converted_data.txt \ - --overwrite True \ No newline at end of file diff --git a/scripts/vocab_extension/merge_tokenizer.sh b/scripts/vocab_extension/merge_tokenizer.sh deleted file mode 100644 index 0cd1722c0..000000000 --- a/scripts/vocab_extension/merge_tokenizer.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -mkdir -p ./output_models/new_tokenizer -python utils/merge_tokenizer.py --tokenizer_dir openlm-research/open_llama_3b \ - --chinese_sp_model_file ./output_models/new_tokenizer/example.model \ - --output_dir ./output_models/merged_tokenizer \ \ No newline at end of file diff --git a/scripts/vocab_extension/train_merge_tokenizer.sh b/scripts/vocab_extension/train_merge_tokenizer.sh deleted file mode 100644 index 2e63e84b6..000000000 --- a/scripts/vocab_extension/train_merge_tokenizer.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# download data -cd data && bash download.sh wiki_zh_eval && cd - - -# convert json to txt for sentencepiece -python utils/convert_json_to_txt.py --dataset_path ./data/wiki_zh_eval \ - --output_path ./data/wiki_zh_eval/converted_data.txt \ - --overwrite True - -# train a new tokenizer -mkdir -p ./output_models/new_tokenizer -python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \ - --model_type bpe \ - --output_dir 
./output_models/new_tokenizer \ - --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \ - --vocab_size 20000 \ - --max_sentencepiece_length 4 - -# merge the new tokenizer with the old one -mkdir -p ./output_models/merged_tokenizer -python utils/merge_tokenizer.py --chinese_sp_model_file ./output_models/new_tokenizer/example.model \ - --tokenizer_dir openlm-research/open_llama_3b \ - --output_dir ./output_models/merged_tokenizer \ No newline at end of file diff --git a/scripts/vocab_extension/train_tokenizer.sh b/scripts/vocab_extension/train_tokenizer.sh deleted file mode 100644 index d61275499..000000000 --- a/scripts/vocab_extension/train_tokenizer.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -mkdir -p ./output_models/merged_tokenizer -python utils/train_tokenizer.py --dataset_path ./data/wiki_zh_eval/converted_data.txt \ - --model_type bpe \ - --output_dir ./output_models/new_tokenizer \ - --user_defined_symbols 0,1,2,3,4,5,6,7,8,9,% \ - --vocab_size 20000 \ - --max_sentencepiece_length 4 \ No newline at end of file diff --git a/service/app.py b/service/app.py deleted file mode 100644 index 7580ac39e..000000000 --- a/service/app.py +++ /dev/null @@ -1,149 +0,0 @@ -from dataclasses import dataclass, field -import json -import os -from typing import Optional - -from accelerate import Accelerator -import torch -from transformers import HfArgumentParser - -from lmflow.args import ModelArguments -from lmflow.models.auto_model import AutoModel -from lmflow.utils.versioning import is_flask_available - -if is_flask_available(): - from flask import Flask, request, stream_with_context - from flask import render_template - from flask_cors import CORS -else: - raise ImportError("Flask is not available. Please install flask and flask_cors.") - -WINDOW_LENGTH = 512 - -@dataclass -class AppArguments: - end_string: Optional[str] = field( - default="##", - metadata={ - "help": "end string mark of the chatbot's output" - }, - ) - max_new_tokens: Optional[int] = field( - default=200, - metadata={ - "help": "maximum number of generated tokens" - }, - ) - -parser = HfArgumentParser(( - ModelArguments, - AppArguments, -)) - -model_args, app_args = ( - parser.parse_args_into_dataclasses() - ) - -app = Flask(__name__) -CORS(app) -ds_config_path = "./examples/ds_config.json" -with open (ds_config_path, "r") as f: - ds_config = json.load(f) - - -local_rank = int(os.getenv("LOCAL_RANK", "0")) -world_size = int(os.getenv("WORLD_SIZE", "1")) -torch.cuda.set_device(local_rank) -model = AutoModel.get_model(model_args, tune_strategy='none', ds_config=ds_config, use_accelerator=True) -accelerator = Accelerator() - -def stream_generate(inputs,context_len = 1024, max_new_tokens=128, end_string="##"): - - - max_src_len = context_len - max_new_tokens - len(end_string) - input_ids = model.tokenizer(inputs).input_ids - input_echo_len = len(input_ids) - output_ids = list(input_ids) - input_ids = input_ids[-max_src_len:] - - past_key_values = out = None - flag_stop = False - for i in range(0, max_new_tokens): - with accelerator.autocast(): - if i == 0: - with torch.no_grad(): - out = model.backend_model(torch.as_tensor([input_ids], device=local_rank), use_cache=True) - logits = out.logits - past_key_values = out.past_key_values - else: - with torch.no_grad(): - out = model.backend_model( - input_ids=torch.as_tensor([[token]], device=local_rank), - use_cache=True, - past_key_values=past_key_values, - ) - logits = out.logits - past_key_values = out.past_key_values - - last_token_logits = logits[0, -1, :] - token = 
int(torch.argmax(last_token_logits)) - output_ids.append(token) - - tmp_output_ids = output_ids[input_echo_len:] - - output = model.tokenizer.decode( - tmp_output_ids, - skip_special_tokens=True, - spaces_between_special_tokens=False, - ) - - if end_string in output: - index = output.index(end_string) - output = output[:index] - flag_stop = True - yield output.replace("\ufffd","") - - if flag_stop == True: - break - -@app.route('/predict',methods = ['POST']) -def predict(): - if(request.method == "POST"): - try: - user_input = request.get_json()["Input"] - conversation = request.get_json()["History"] - - history_input = "" - if(len(conversation) >= 2): - if(len(conversation) == 2): - history_input ="###Human: " + user_input +" " - else: - for i in range(0, len(conversation)-1): - if(i % 2 == 0): - history_input = history_input + "###Human: " + conversation[i+1]["content"] + " " - elif(i % 2 == 1): - history_input = history_input + "###Assistant:" + conversation[i+1]["content"] - history_input = history_input + "###Assistant:" - - if len(model.encode(history_input))> WINDOW_LENGTH: - inputs = model.encode(history_input) - inputs = inputs[-WINDOW_LENGTH:] - history_input = model.decode(inputs) - - return app.response_class(stream_with_context(stream_generate(history_input, - max_new_tokens=app_args.max_new_tokens, - end_string=app_args.end_string))) - except Exception as ex: - print(ex) - text_out = ex - else: - text_out = "Not POST Method" - return text_out - -@app.route('/',methods = ['GET']) -def login(): - - return render_template('index.html') - - -app.run(port = 5000, debug = False) diff --git a/service/static/assets/background.png b/service/static/assets/background.png deleted file mode 100644 index cb656f70a..000000000 Binary files a/service/static/assets/background.png and /dev/null differ diff --git a/service/static/assets/logo.png b/service/static/assets/logo.png deleted file mode 100644 index d2b683d6e..000000000 Binary files a/service/static/assets/logo.png and /dev/null differ diff --git a/service/static/utils/vue-spinner.js b/service/static/utils/vue-spinner.js deleted file mode 100644 index cc0d655e2..000000000 --- a/service/static/utils/vue-spinner.js +++ /dev/null @@ -1,3858 +0,0 @@ -(function webpackUniversalModuleDefinition(root, factory) { - if(typeof exports === 'object' && typeof module === 'object') - module.exports = factory(); - else if(typeof define === 'function' && define.amd) - define([], factory); - else if(typeof exports === 'object') - exports["VueSpinner"] = factory(); - else - root["VueSpinner"] = factory(); -})(this, function() { -return /******/ (function(modules) { // webpackBootstrap -/******/ // The module cache -/******/ var installedModules = {}; -/******/ -/******/ // The require function -/******/ function __webpack_require__(moduleId) { -/******/ -/******/ // Check if module is in cache -/******/ if(installedModules[moduleId]) -/******/ return installedModules[moduleId].exports; -/******/ -/******/ // Create a new module (and put it into the cache) -/******/ var module = installedModules[moduleId] = { -/******/ exports: {}, -/******/ id: moduleId, -/******/ loaded: false -/******/ }; -/******/ -/******/ // Execute the module function -/******/ modules[moduleId].call(module.exports, module, module.exports, __webpack_require__); -/******/ -/******/ // Flag the module as loaded -/******/ module.loaded = true; -/******/ -/******/ // Return the exports of the module -/******/ return module.exports; -/******/ } -/******/ -/******/ -/******/ // expose 
diff --git a/service/templates/index.html b/service/templates/index.html
deleted file mode 100644
index 3fe0b7e17..000000000
--- a/service/templates/index.html
+++ /dev/null
@@ -1,198 +0,0 @@
- - - - \ No newline at end of file diff --git a/setup.py b/setup.py index 37ee0e965..1a8393607 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ import os -from setuptools import find_packages -from setuptools import setup + +from setuptools import find_packages, setup folder = os.path.dirname(__file__) version_path = os.path.join(folder, "src", "lmflow", "version.py") @@ -22,7 +22,10 @@ "gradio": ["gradio"], "flask": ["flask", "flask_cors"], "flash_attn": ["flash-attn>=2.0.2"], - "trl": ["trl==0.8.0"] + "trl": ["trl==0.8.0"], + "deepspeed": ["deepspeed>=0.14.4"], + "develop": ["pytest"], + "dev": ["ruff", "pytest", "pre-commit"], } readme_path = os.path.join(folder, "README.md") @@ -53,4 +56,4 @@ ) # optionals -# lm-eval==0.3.0 \ No newline at end of file +# lm-eval==0.3.0 diff --git a/src/lmflow/__init__.py b/src/lmflow/__init__.py index 529b2232f..6430d5b67 100644 --- a/src/lmflow/__init__.py +++ b/src/lmflow/__init__.py @@ -10,4 +10,6 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.27.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") \ No newline at end of file +require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") + +__all__ = ["args", "datasets", "models", "pipeline", "utils"] diff --git a/src/lmflow/args.py b/src/lmflow/args.py index 29d4f3062..e8cb15781 100644 --- a/src/lmflow/args.py +++ b/src/lmflow/args.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 """This script defines dataclasses: ModelArguments and DatasetArguments, that contain the arguments for the model and dataset used in training. @@ -11,10 +10,11 @@ MODEL_FOR_CAUSAL_LM_MAPPING. MODEL_TYPES is assigned a tuple of the model types extracted from the MODEL_CONFIG_CLASSES. """ + import logging -from dataclasses import dataclass, field, fields, Field, make_dataclass +from dataclasses import dataclass, field from pathlib import Path -from typing import Optional, List, Union, Dict +from typing import Optional from transformers import ( MODEL_FOR_CAUSAL_LM_MAPPING, @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) -class OptimizerNames(): +class OptimizerNames: DUMMY = "dummy" ADABELIEF = "adabelief" ADABOUND = "adabound" @@ -52,13 +52,14 @@ class OptimizerNames(): MUON = "muon" ADAMW_SCHEDULE_FREE = "adamw_schedule_free" SGD_SCHEDULE_FREE = "sgd_schedule_free" - + + @dataclass class ModelArguments: """ - Define a class ModelArguments using the dataclass decorator. - The class contains several optional parameters that can be used to configure a model. - + Define a class ModelArguments using the dataclass decorator. + The class contains several optional parameters that can be used to configure a model. + model_name_or_path : str a string representing the path or name of a pretrained model checkpoint for weights initialization. If None, a model will be trained from scratch. @@ -66,15 +67,15 @@ class ModelArguments: model_type : str a string representing the type of model to use if training from scratch. If not provided, a pretrained model will be used. - + config_overrides : str a string representing the default config settings to override when training a model from scratch. - + config_name : str a string representing the name or path of the pretrained config to use, if different from the model_name_or_path. 
- + tokenizer_name : str a string representing the name or path of the pretrained tokenizer to use, if different from the model_name_or_path. @@ -101,19 +102,19 @@ class ModelArguments: use_ram_optimized_load : bool a boolean indicating whether to use disk mapping when memory is not enough. - + use_int8 : bool a boolean indicating whether to load int8 quantization for inference. - + load_in_4bit : bool whether to load the model in 4bit - + model_max_length : int The maximum length of the model. - + truncation_side : str The side on which the model should have truncation applied. - + arch_type : str Model architecture type. padding_side : str @@ -140,7 +141,7 @@ class ModelArguments: " Along with the original non-finetuned model forms the whole" " finetuned model." ) - } + }, ) model_type: Optional[str] = field( default=None, @@ -182,17 +183,11 @@ class ModelArguments: ) token: Optional[str] = field( default=None, - metadata={ - "help": ("Necessary to specify when accessing a private model/dataset.") - }, + metadata={"help": ("Necessary to specify when accessing a private model/dataset.")}, ) trust_remote_code: bool = field( default=False, - metadata={ - "help": ( - "Whether to trust remote code when loading model." - ) - }, + metadata={"help": ("Whether to trust remote code when loading model.")}, ) torch_dtype: Optional[str] = field( default=None, @@ -216,15 +211,26 @@ class ModelArguments: default=False, metadata={"help": "Whether to use qlora."}, ) - bits: int = field( + bits: Optional[int] = field( + default=None, + metadata={ + "help": "[deprecated] The number of bits for quantization.", + "choices": [4, 8], + }, + ) + quant_bit: int = field( default=4, - metadata={"help": "The number of bits for quantization.", - "choices": [4, 8], }, + metadata={ + "help": "The number of bits for quantization.", + "choices": [4, 8], + }, ) quant_type: str = field( - default='nf4', - metadata={"help": "The quantization type for quantization.", - "choices": ["nf4", "fp4"], }, + default="nf4", + metadata={ + "help": "The quantization type for quantization.", + "choices": ["nf4", "fp4"], + }, ) double_quant: bool = field( default=True, @@ -237,7 +243,9 @@ class ModelArguments: lora_alpha: int = field( default=32, metadata={ - "help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter called alpha in the paper."}, + "help": "Merging ratio between the fine-tuned model and the original. This is controlled by a parameter " + "called alpha in the paper." + }, ) lora_target_modules: str = field( default=None, metadata={"help": "Model modules to apply LoRA to. Use comma to separate multiple modules."} @@ -251,25 +259,14 @@ class ModelArguments: metadata={"help": "Whether to save aggregated lora."}, ) use_ram_optimized_load: bool = field( - default=True, - metadata={"help": "Whether use disk mapping when memory is not enough."} + default=True, metadata={"help": "Whether use disk mapping when memory is not enough."} ) use_flash_attention: bool = field( default=False, - metadata={ - "help": ( - "whether use flash attention layer to reduce GPU memory with" - " higher time cost." - ) - } + metadata={"help": ("whether use flash attention layer to reduce GPU memory with higher time cost.")}, ) truncate_to_model_max_length: bool = field( - default=True, - metadata={ - "help": ( - "whether truncate the dataset to model max length." 
- ) - } + default=True, metadata={"help": ("whether truncate the dataset to model max length.")} ) do_rope_scaling: bool = field( default=False, @@ -281,39 +278,22 @@ class ModelArguments: "NTK_scaling credits to the Reddit users /u/bloc97 and /u/emozilla." "https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/" ) - } - ) - rope_pi_ratio: int = field( - default=1, - metadata={ - "help": ( - "the ratio of pi in RoPE scaling." - ) - } - ) - rope_ntk_ratio: int = field( - default=1, - metadata={ - "help": ( - "the ratio of NTK in RoPE scaling." - ) - } - ) - use_int8: bool = field( - default=False, - metadata={"help": "whether to load int8 quantization for inference"} + }, ) + rope_pi_ratio: int = field(default=1, metadata={"help": ("the ratio of pi in RoPE scaling.")}) + rope_ntk_ratio: int = field(default=1, metadata={"help": ("the ratio of NTK in RoPE scaling.")}) + use_int8: bool = field(default=False, metadata={"help": "whether to load int8 quantization for inference"}) load_in_4bit: Optional[bool] = field( default=True, - metadata={ - "help": "whether to load the model in 4bit" - }, + metadata={"help": "whether to load the model in 4bit"}, ) model_max_length: Optional[int] = field( default=None, - metadata={"help": ( - "The maximum length of the model. When not specified, " - "will follow the model's default max length. (i.e., tokenizer.model_max_length)") + metadata={ + "help": ( + "The maximum length of the model. When not specified, " + "will follow the model's default max length. (i.e., tokenizer.model_max_length)" + ) }, ) truncation_side: str = field( @@ -322,24 +302,23 @@ class ModelArguments: "help": ( "The side on which the tokenizer should have truncation applied. " "When not specified, will follow the tokenizer's default truncation strategy. " - "(i.e., tokenizer.truncation_side)"), + "(i.e., tokenizer.truncation_side)" + ), "choices": [None, "left", "right"], }, ) padding_side: str = field( - default='right', + default="right", metadata={ "help": ( "The side on which the tokenizer should have padding applied. " "LMFlow uses right padding by default. When set to `auto`, will " - "use padding_side from tokenizer.padding_side."), + "use padding_side from tokenizer.padding_side." + ), "choices": ["right", "left", "auto"], - } - ) - eos_padding: Optional[bool] = field( - default=False, - metadata={"help": "whether to pad with eos token"} + }, ) + eos_padding: Optional[bool] = field(default=False, metadata={"help": "whether to pad with eos token"}) ignore_bias_buffers: Optional[bool] = field( default=False, metadata={ @@ -348,71 +327,62 @@ class ModelArguments: "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992" }, ) - def __post_init__(self): if self.config_overrides is not None and (self.config_name is not None or self.model_name_or_path is not None): raise ValueError( "--config_overrides can't be used in combination with --config_name or --model_name_or_path" ) - + if self.use_qlora: if not self.use_lora: logger.warning("use_qlora is set to True, but use_lora is not set to True. Setting use_lora to True.") self.use_lora = True - + if self.use_flash_attention: if not is_flash_attn_available(): self.use_flash_attention = False - logger.warning("Flash attention is not available in the current environment. Disabling flash attention.") - + logger.warning( + "Flash attention is not available in the current environment. Disabling " + "flash attention. 
If you want to use flash attention, please install by " + "`pip install -e '.[flash_attn]'`." + ) + else: + logger.warning( + "Flash attention is not enabled. We recommend enabling flash attention by " + "`--use_flash_attention 1` for better performance." + ) + if self.lora_target_modules is not None: - self.lora_target_modules: List[str] = split_args(self.lora_target_modules) + self.lora_target_modules: list[str] = split_args(self.lora_target_modules) + + if "encoder_decoder" in self.arch_type: + raise NotImplementedError("The encoder-decoder model is not fully implemented yet.") + + if self.bits is not None: + logger.warning("The argument `bits` is deprecated. Please use `quant_bit` instead.") + self.quant_bit = self.bits @dataclass class VisModelArguments(ModelArguments): - low_resource: Optional[bool] = field( - default=False, - metadata={ - "help": "Use 8 bit and float16 when loading llm" - } - ) - custom_model: bool = field( - default=False, - metadata={"help": "flag for the model from huggingface or not"} - ) + low_resource: Optional[bool] = field(default=False, metadata={"help": "Use 8 bit and float16 when loading llm"}) + custom_model: bool = field(default=False, metadata={"help": "flag for the model from huggingface or not"}) pretrained_language_projection_path: str = field( - default=None, - metadata={"help": "path for model pretrained_language_projection_path"} - ) - custom_vision_model: bool = field( - default=False, - metadata={"help": "flag for the model from huggingface or not"} + default=None, metadata={"help": "path for model pretrained_language_projection_path"} ) + custom_vision_model: bool = field(default=False, metadata={"help": "flag for the model from huggingface or not"}) image_encoder_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": ( - "The name or path of the image encoder to use." - ) - }, + metadata={"help": ("The name or path of the image encoder to use.")}, ) qformer_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": ( - "llm model in multi-modality model" - ) - }, + metadata={"help": ("llm model in multi-modality model")}, ) llm_model_name_or_path: Optional[str] = field( default=None, - metadata={ - "help": ( - "llm model in multi-modality model" - ) - }, + metadata={"help": ("llm model in multi-modality model")}, ) use_prompt_cache: bool = field( default=False, @@ -497,24 +467,18 @@ class DatasetArguments: a string representing the path to the dataset cache directory. Useful when the default cache dir (`~/.cache/huggingface/datasets`) has limited space. - The class also includes some additional parameters that can be used to configure the dataset further, such as `overwrite_cache`, - `validation_split_percentage`, `preprocessing_num_workers`, `disable_group_texts`, `demo_example_in_prompt`, `explanation_in_prompt`, - `keep_linebreaks`, and `prompt_structure`. + The class also includes some additional parameters that can be used to configure the dataset further, such as + `overwrite_cache`, `validation_split_percentage`, `preprocessing_num_workers`, `disable_group_texts`, + `demo_example_in_prompt`, `explanation_in_prompt`, `keep_linebreaks`, and `prompt_structure`. - The field function is used to set default values and provide help messages for each parameter. The Optional type hint is - used to indicate that a parameter is optional. The metadata argument is used to provide additional information about - each parameter, such as a help message. 
+ The field function is used to set default values and provide help messages for each parameter. The Optional type + hint is used to indicate that a parameter is optional. The metadata argument is used to provide additional + information about each parameter, such as a help message. """ - dataset_path: Optional[str] = field( - default=None, metadata={"help": "The path of the dataset to use."} - ) - dataset_name: Optional[str] = field( - default="customized", metadata={"help": "Should be \"customized\""} - ) - is_custom_dataset: Optional[bool] = field( - default=False, metadata={"help": "whether to use custom data"} - ) + dataset_path: Optional[str] = field(default=None, metadata={"help": "The path of the dataset to use."}) + dataset_name: Optional[str] = field(default="customized", metadata={"help": 'Should be "customized"'}) + is_custom_dataset: Optional[bool] = field(default=False, metadata={"help": "whether to use custom data"}) customized_cache_dir: Optional[str] = field( default=".cache/llm-ft/datasets", metadata={"help": "Where do you want to store the customized dataset caches"}, @@ -556,14 +520,10 @@ class DatasetArguments: ) }, ) - overwrite_cache: bool = field( - default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} - ) + overwrite_cache: bool = field(default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}) validation_split_percentage: Optional[int] = field( default=5, - metadata={ - "help": "The percentage of the train set used as validation set in case there's no validation split" - }, + metadata={"help": "The percentage of the train set used as validation set in case there's no validation split"}, ) preprocessing_num_workers: Optional[int] = field( default=None, @@ -577,7 +537,7 @@ class DatasetArguments: " `group_texts` operation. See `--disable_group_texts` for" " detailed explanation of this operation." ) - } + }, ) disable_group_texts: bool = field( default=True, @@ -606,17 +566,28 @@ class DatasetArguments: metadata={"help": "Evaluation File Path"}, ) train_on_prompt: bool = field( - default=False, - metadata={"help": "Whether to train on prompt for conversation datasets such as ShareGPT."} + default=False, metadata={"help": "Whether to train on prompt for conversation datasets such as ShareGPT."} ) conversation_template: Optional[str] = field( - default=None, - metadata={"help": "The template for conversation datasets."} + default=None, metadata={"help": "The template for conversation datasets."} ) dataset_cache_dir: Optional[str] = field( default=None, - metadata={"help": ("The path to the dataset cache directory. Useful when the " - "default cache dir (`~/.cache/huggingface/datasets`) has limited space.")} + metadata={ + "help": ( + "The path to the dataset cache directory. Useful when the " + "default cache dir (`~/.cache/huggingface/datasets`) has limited space." + ) + }, + ) + calculate_dataset_stats: bool = field( + default=False, + metadata={ + "help": ( + "Whether to calculate the dataset statistics, including the number of samples, " + "the average length of samples, total tokens, etc." 
+ ) + }, ) def __post_init__(self): @@ -636,21 +607,11 @@ def __post_init__(self): @dataclass class MultiModalDatasetArguments(DatasetArguments): - image_folder: Optional[str] = field( - default=None, metadata={"help": "The folder of the image file."} - ) - image_aspect_ratio: Optional[str] = field( - default="pad", metadata={"help": "The ratio type"} - ) - is_multimodal: Optional[bool] = field( - default=True, metadata={"help": "Flag for the modality type."} - ) - use_image_start_end: Optional[bool] = field( - default=True, metadata={"help": "Flag for the modality type."} - ) - sep_style: Optional[str] = field( - default="plain", metadata={"help": "Sep style in multi_modality dataset."} - ) + image_folder: Optional[str] = field(default=None, metadata={"help": "The folder of the image file."}) + image_aspect_ratio: Optional[str] = field(default="pad", metadata={"help": "The ratio type"}) + is_multimodal: Optional[bool] = field(default=True, metadata={"help": "Flag for the modality type."}) + use_image_start_end: Optional[bool] = field(default=True, metadata={"help": "Flag for the modality type."}) + sep_style: Optional[str] = field(default="plain", metadata={"help": "Sep style in multi_modality dataset."}) @dataclass @@ -658,128 +619,65 @@ class FinetunerArguments(TrainingArguments): """ Adapt transformers.TrainingArguments """ - eval_dataset_path: Optional[str] = field( - default=None, metadata={"help": "The path of the eval dataset to use."} - ) + + eval_dataset_path: Optional[str] = field(default=None, metadata={"help": "The path of the eval dataset to use."}) remove_unused_columns: Optional[bool] = field( - default=False, - metadata={ - "help": "whether to remove the unused columns in collate fn"} - ) - finetune_part: Optional[str] = field( - default="language_projection", - metadata={ - "help": "the module to finetune." - } + default=False, metadata={"help": "whether to remove the unused columns in collate fn"} ) + finetune_part: Optional[str] = field(default="language_projection", metadata={"help": "the module to finetune."}) save_language_projection: Optional[str] = field( - default=False, - metadata={ - "help": "whether to save language projection layer in multi-modal models." - } - ) - use_lisa: bool = field( - default=False, - metadata={ - "help": "whether to use LISA training strategy." - } - ) - lisa_activated_layers: int = field( - default=2, - metadata={ - "help": "the number of activated layers in LISA." - } + default=False, metadata={"help": "whether to save language projection layer in multi-modal models."} ) + use_lisa: bool = field(default=False, metadata={"help": "whether to use LISA training strategy."}) + lisa_activated_layers: int = field(default=2, metadata={"help": "the number of activated layers in LISA."}) lisa_interval_steps: int = field( default=20, metadata={ - "help": "the number of steps in each freezing interval of LISA, i.e. the selected unfreezed layers are randomly switched every {lisa_interval_steps} steps." - } + "help": "the number of steps in each freezing interval of LISA, i.e. the selected unfreezed layers " + "are randomly switched every {lisa_interval_steps} steps." + }, ) lisa_layers_attribute: str = field( - default="model.model.layers", - metadata={ - "help": "where the layer attribute stores, e.g. model.model.layers" - } - ) - use_customized_optim: bool = field( - default=False, - metadata={ - "help": "whether to use customized optimizers." 
- } - ) - customized_optim: str = field( - default="sign_sgd", - metadata={ - "help": "name of the customized optimizer." - } - ) - customized_optim_args: str = field( - default=None, - metadata={ - "help": "optional arguments that are supplied." - } + default="model.model.layers", metadata={"help": "where the layer attribute stores, e.g. model.model.layers"} ) + use_customized_optim: bool = field(default=False, metadata={"help": "whether to use customized optimizers."}) + customized_optim: str = field(default="sign_sgd", metadata={"help": "name of the customized optimizer."}) + customized_optim_args: str = field(default=None, metadata={"help": "optional arguments that are supplied."}) optim_dummy_beta1: float = field( - default=0.9, - metadata={ - "help": "A useless argument for dummy optimizer, just for tutorial" - } + default=0.9, metadata={"help": "A useless argument for dummy optimizer, just for tutorial"} ) optim_dummy_beta2: float = field( - default=0.999, - metadata={ - "help": "A useless argument for dummy optimizer, just for tutorial" - } + default=0.999, metadata={"help": "A useless argument for dummy optimizer, just for tutorial"} ) optim_adam_beta1: float = field( - default=0.9, - metadata={ - "help": "Coefficient used for computing running averages of gradient" - } + default=0.9, metadata={"help": "Coefficient used for computing running averages of gradient"} ) optim_adam_beta2: float = field( - default=0.999, - metadata={ - "help": "Coefficient used for computing running averages of squared gradient" - } + default=0.999, metadata={"help": "Coefficient used for computing running averages of squared gradient"} ) optim_beta1: float = field( - default=0.9, - metadata={ - "help": "Coefficient used for computing running averages of gradient" - } + default=0.9, metadata={"help": "Coefficient used for computing running averages of gradient"} ) optim_beta2: float = field( - default=0.999, - metadata={ - "help": "Coefficient used for computing running averages of squared gradient" - } + default=0.999, metadata={"help": "Coefficient used for computing running averages of squared gradient"} ) optim_beta3: float = field( - default=0.9, - metadata={ - "help": "Coefficient used for computing running averages of gradient" - } + default=0.9, metadata={"help": "Coefficient used for computing running averages of gradient"} ) optim_momentum: float = field( - default=0.999, - metadata={ - "help": "Coefficient used for the momentum term in optimizers like SGD with momentum" - } + default=0.999, metadata={"help": "Coefficient used for the momentum term in optimizers like SGD with momentum"} ) optim_weight_decay: float = field( - default=0, - metadata={ - "help": "Weight decay (L2 penalty) added to the loss to prevent overfitting" - } + default=0, metadata={"help": "Weight decay (L2 penalty) added to the loss to prevent overfitting"} ) - + + @dataclass class RewardModelTunerArguments(FinetunerArguments): """ Arguments for reward modeling. """ + pass @@ -813,33 +711,18 @@ class EvaluatorArguments: repetition_penalty : float An argument of model.generate in huggingface to penalize repetitions. 
""" - local_rank: int = field( - default=-1, - metadata={"help": "For distributed training: local_rank" - } - ) - random_shuffle: Optional[bool] = field( - default=False, - metadata={"help": "" - } - ) + local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) + + random_shuffle: Optional[bool] = field(default=False, metadata={"help": ""}) use_wandb: Optional[bool] = field( default=False, - metadata={ - "help": ( - "When this flag is True, wandb will be enabled" - ) - }, + metadata={"help": ("When this flag is True, wandb will be enabled")}, ) random_seed: Optional[int] = field( default=1, - metadata={ - "help": ( - "used to set random seed" - ) - }, + metadata={"help": ("used to set random seed")}, ) output_dir: Optional[str] = field( default="./output_dir", @@ -848,9 +731,7 @@ class EvaluatorArguments: mixed_precision: Optional[str] = field( default="bf16", metadata={ - "help": ( - "mixed precision mode, whether to use bf16 or fp16" - ), + "help": ("mixed precision mode, whether to use bf16 or fp16"), "choices": ["bf16", "fp16"], }, ) @@ -867,13 +748,13 @@ class EvaluatorArguments: default="text", metadata={ "help": ( - 'Question type for answer extraction from the decoder output.' - ' Supported types: \n' + "Question type for answer extraction from the decoder output." + " Supported types: \n" ' 1) "multiple_choice", e.g. A, B, C, D, ...\n' ' 2) "binary_choice", e.g. yes, no, maybe\n' ' 3) "math", e.g. 1.0, -3.52\n' ' 4) "text", e.g. "I think that it is okay"\n' - ' 5) Special treatment for several datasets\n' + " 5) Special treatment for several datasets\n" ' - "gsm8k"\n' ' - "svamp"\n' ' - "asdiv"\n' @@ -893,9 +774,9 @@ class EvaluatorArguments: default="{input}", metadata={ "help": ( - 'Prompt structure to facilitate prompt engineering during' - ' inference. The model will receive' - ' `prompt_structure.format(input=input)` as its input.' + "Prompt structure to facilitate prompt engineering during" + " inference. The model will receive" + " `prompt_structure.format(input=input)` as its input." ) }, ) @@ -903,8 +784,9 @@ class EvaluatorArguments: default=512, metadata={ "help": ( - "the model will have at least block_size tokens for context when calculating the conditional likelihood of any one token" - " (provided there are block_size preceding tokens available to condition on)" + "the model will have at least block_size tokens for context when calculating the " + "conditional likelihood of any one token (provided there are block_size preceding " + "tokens available to condition on)" ) }, ) @@ -925,8 +807,9 @@ class EvaluatorArguments: ), }, ) - use_accelerator_for_evaluator: bool = field( - default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"}, + use_accelerator_for_evaluator: Optional[bool] = field( + default=None, + metadata={"help": "[Deprecated] Whether to use Huggingface Accelerator instead of Deepspeed"}, ) temperature: float = field( @@ -944,6 +827,19 @@ class EvaluatorArguments: metadata={"help": "Maximum length during inference."}, ) + minibatch_size: int = field( + default=1, + metadata={"help": "Mini batch size during evaluation."}, + ) + + def __post_init__(self): + if self.use_accelerator_for_evaluator is not None: + logger.warning( + "You've specified `use_accelerator_for_evaluator`. This argument is deprecated. " + "It will not take effect and will be removed in a future version, " + "since LMFlow now can automatically detect whether is in Accelerate or Deepspeed environment." 
+ ) + @dataclass class InferencerArguments: @@ -967,13 +863,13 @@ class InferencerArguments: use_beam_search : Optional[bool] Whether to use beam search during inference, By default False. num_output_sequences : Optional[int] - Number of output sequences to return for the given prompt, + Number of output sequences to return for the given prompt, currently only used in vllm inference, By default 8. top_p : Optional[float] top_p for sampling, By default 1.0. top_k : Optional[int] top_k for sampling, By default -1 (no top_k). - additional_stop_token_ids : Optional[List[int]] + additional_stop_token_ids : Optional[list[int]] the ids of the end of sentence tokens, By default []. apply_chat_template : Optional[bool] Whether to apply chat template, By default True. @@ -982,10 +878,10 @@ class InferencerArguments: results_path : Optional[str] The **json file** path of inference results, By default None. enable_decode_inference_result : Optional[bool] - Whether to detokenize the inference results. + Whether to detokenize the inference results. - NOTE: For iterative align pipelines, whether to detokenize depends on - the homogeneity of the policy model and the reward model + NOTE: For iterative align pipelines, whether to detokenize depends on + the homogeneity of the policy model and the reward model (i.e., if they have the same tokenizer). use_vllm: bool, optional Whether to use VLLM for inference, By default False. @@ -995,6 +891,7 @@ class InferencerArguments: The GPU memory utilization for VLLM inference. The proportion of GPU memory (per GPU) to use for VLLM inference. """ + device: str = field( default="gpu", metadata={ @@ -1004,17 +901,13 @@ class InferencerArguments: ) local_rank: int = field( default=-1, - metadata={"help": "For distributed training: local_rank" - }, + metadata={"help": "For distributed training: local_rank"}, ) inference_batch_size: int = field( default=1, metadata={"help": "batch size for inference"}, ) - vllm_inference_batch_size: int = field( - default=1, - metadata={"help": "The batch size for VLLM inference."} - ) + vllm_inference_batch_size: int = field(default=1, metadata={"help": "The batch size for VLLM inference."}) temperature: float = field( default=0.0, metadata={"help": "Temperature during inference."}, @@ -1032,11 +925,7 @@ class InferencerArguments: random_seed: Optional[int] = field( default=1, - metadata={ - "help": ( - "used to set random seed" - ) - }, + metadata={"help": ("used to set random seed")}, ) deepspeed: Optional[str] = field( default=None, @@ -1050,20 +939,17 @@ class InferencerArguments: mixed_precision: Optional[str] = field( default="bf16", metadata={ - "help": ( - "mixed precision mode, whether to use bf16 or fp16" - ), + "help": ("mixed precision mode, whether to use bf16 or fp16"), "choices": ["bf16", "fp16"], }, ) do_sample: Optional[bool] = field( default=False, - metadata={ - "help": "whether turn on true random sampling during inference." 
- }, + metadata={"help": "whether turn on true random sampling during inference."}, ) - use_accelerator: bool = field( - default=False, metadata={"help": "Whether to use Huggingface Accelerator instead of Deepspeed"}, + use_accelerator: Optional[bool] = field( + default=None, + metadata={"help": "[Deprecated] Whether to use Huggingface Accelerator instead of Deepspeed"}, ) use_beam_search: Optional[bool] = field( default=False, @@ -1071,10 +957,11 @@ class InferencerArguments: ) num_output_sequences: Optional[int] = field( default=8, - metadata={"help": ( - "number of output sequences to return for the given prompt, " - "currently only used in vllm inference." - )}, + metadata={ + "help": ( + "number of output sequences to return for the given prompt, currently only used in vllm inference." + ) + }, ) top_p: Optional[float] = field( default=1.0, @@ -1084,8 +971,8 @@ class InferencerArguments: default=-1, metadata={"help": "top_k for sampling."}, ) - additional_stop_token_ids: Optional[List[int]] = field( - default_factory=lambda: [], + additional_stop_token_ids: Optional[list[int]] = field( + default_factory=lambda: [], metadata={"help": "the ids of the end of sentence tokens"}, ) apply_chat_template: Optional[bool] = field( @@ -1097,41 +984,36 @@ class InferencerArguments: metadata={"help": "Whether to decode the inference results."}, ) tensor_parallel_size: Optional[int] = field( - default=1, - metadata={"help": "The tp size for distributed (multi-instance) inference."} + default=1, metadata={"help": "The tp size for distributed (multi-instance) inference."} ) enable_distributed_inference: Optional[bool] = field( - default=False, - metadata={"help": "Whether to use multi-instance VLLM inference."} + default=False, metadata={"help": "Whether to use multi-instance VLLM inference."} ) distributed_inference_num_instances: Optional[int] = field( - default=1, - metadata={"help": "The number of instances for multi-instance VLLM inference."} + default=1, metadata={"help": "The number of instances for multi-instance VLLM inference."} ) - + # vllm inference args - use_vllm: bool = field( - default=False, - metadata={"help": "Whether to use VLLM for inference, By default False."} - ) + use_vllm: bool = field(default=False, metadata={"help": "Whether to use VLLM for inference, By default False."}) vllm_tensor_parallel_size: Optional[int] = field( - default=1, - metadata={"help": "The tensor parallel size for VLLM inference."} + default=1, metadata={"help": "The tensor parallel size for VLLM inference."} ) vllm_gpu_memory_utilization: Optional[float] = field( - default=0.95, - metadata={"help": "The GPU memory utilization for VLLM inference."} + default=0.95, metadata={"help": "The GPU memory utilization for VLLM inference."} ) - + # Args for result saving - save_results: Optional[bool] = field( - default=False, metadata={"help": "Whether to save inference results."} - ) - results_path: Optional[str] = field( - default=None, metadata={"help": "The path of inference results."} - ) - + save_results: Optional[bool] = field(default=False, metadata={"help": "Whether to save inference results."}) + results_path: Optional[str] = field(default=None, metadata={"help": "The path of inference results."}) + def __post_init__(self): + if self.use_accelerator is not None: + logger.warning( + "You've specified `use_accelerator`. This argument is deprecated. 
" + "It will not take effect and will be removed in a future version, " + "since LMFlow now can automatically detect whether is in Accelerate or Deepspeed environment." + ) + if self.save_results: if self.results_path is None: raise ValueError("Need to specify results_path when save_results is True.") @@ -1147,43 +1029,29 @@ class RaftAlignerArguments(TrainingArguments): """ Define a class RaftAlignerArguments to configure raft aligner. """ + output_reward_path: Optional[str] = field( - default="tmp/raft_aligner/", - metadata={ - "help": "The path of output rewards." - } + default="tmp/raft_aligner/", metadata={"help": "The path of output rewards."} ) output_min_length: Optional[int] = field( default=64, metadata={ - "help": ( - "minimum length of the output token sequence generated from" - " model given an input." - ), + "help": ("minimum length of the output token sequence generated from model given an input."), }, ) output_max_length: Optional[int] = field( default=128, metadata={ - "help": ( - "maximum length of the output token sequence generated from" - " model given an output." - ), + "help": ("maximum length of the output token sequence generated from model given an output."), }, ) num_raft_iteration: Optional[int] = field( default=20, - metadata={ - "help": "number of iterations of the raft aligner." - }, + metadata={"help": "number of iterations of the raft aligner."}, ) raft_batch_size: Optional[int] = field( default=1024, - metadata={ - "help": ( - "only select {raft_batch_size} samples each time for STF training." - ) - }, + metadata={"help": ("only select {raft_batch_size} samples each time for STF training.")}, ) top_reward_percentage: Optional[float] = field( default=0.2, @@ -1220,17 +1088,30 @@ class RaftAlignerArguments(TrainingArguments): class BenchmarkingArguments: dataset_name: Optional[str] = field( default=None, - metadata={ - "help": "benchmark dataset name provided by lmflow" - }, + metadata={"help": "benchmark dataset name provided by lmflow"}, ) lm_evaluation_metric: Optional[str] = field( default="accuracy", metadata={ "help": "the metric the model will be evaluated on", - "choices": ["acc", "acc_norm", "bleu", "chrf", "em", "f1", "ppl", \ - "ter", "r@1", "r@2", "mrr", "mc1", "mc2", "word_perplexity", \ - "byte_perplexity", "bits_per_byte"], + "choices": [ + "acc", + "acc_norm", + "bleu", + "chrf", + "em", + "f1", + "ppl", + "ter", + "r@1", + "r@2", + "mrr", + "mc1", + "mc2", + "word_perplexity", + "byte_perplexity", + "bits_per_byte", + ], }, ) @@ -1240,149 +1121,81 @@ class DPOAlignerArguments: """ The arguments for the DPO training script. 
""" + local_rank: int = field( default=-1, - metadata={"help": "For distributed training: local_rank" - }, + metadata={"help": "For distributed training: local_rank"}, ) # data parameters - beta: Optional[float] = field( - default=0.1, - metadata={ - "help": "the beta parameter for DPO loss" - } - ) + beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"}) # # training parameters - learning_rate: Optional[float] = field( - default=5e-4, - metadata={ - "help": "optimizer learning rate" - } - ) - lr_scheduler_type: Optional[str] = field( - default="cosine", - metadata={ - "help": "the lr scheduler type" - } - ) - warmup_steps: Optional[int] = field( - default=100, metadata={ - "help": "the number of warmup steps" - } - ) - weight_decay: Optional[float] = field( - default=0.05, metadata={ - "help": "the weight decay" - } - ) - optimizer_type: Optional[str] = field( - default="paged_adamw_32bit", - metadata={ - "help": "the optimizer type" - } - ) - - per_device_train_batch_size: Optional[int] = field( - default=4, - metadata={ - "help": "train batch size per device" - } - ) - per_device_eval_batch_size: Optional[int] = field( - default=1, metadata={ - "help": "eval batch size per device" - } - ) + learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"}) + lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"}) + warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"}) + weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"}) + optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"}) + + per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"}) + per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"}) gradient_accumulation_steps: Optional[int] = field( default=4, - metadata={ - "help": "the number of gradient accumulation steps" - }, + metadata={"help": "the number of gradient accumulation steps"}, ) gradient_checkpointing: Optional[bool] = field( default=True, - metadata={ - "help": "whether to use gradient checkpointing" - }, + metadata={"help": "whether to use gradient checkpointing"}, ) gradient_checkpointing_use_reentrant: Optional[bool] = field( default=False, - metadata={ - "help": "whether to use reentrant for gradient checkpointing" - }, + metadata={"help": "whether to use reentrant for gradient checkpointing"}, ) max_prompt_length: Optional[int] = field( default=512, - metadata={ - "help": "the maximum prompt length" - }, + metadata={"help": "the maximum prompt length"}, ) max_length: Optional[int] = field( default=1024, - metadata={ - "help": "the maximum sequence length" - }, + metadata={"help": "the maximum sequence length"}, ) max_steps: Optional[int] = field( default=1000, - metadata={ - "help": "max number of training steps" - }, + metadata={"help": "max number of training steps"}, ) logging_steps: Optional[int] = field( default=10, - metadata={ - "help": "the logging frequency" - }, + metadata={"help": "the logging frequency"}, ) save_steps: Optional[int] = field( default=100, - metadata={ - "help": "the saving frequency" - }, + metadata={"help": "the saving frequency"}, ) eval_steps: Optional[int] = field( default=100, - metadata={ - "help": "the evaluation frequency" - }, + metadata={"help": "the evaluation 
frequency"}, ) output_dir: Optional[str] = field( default="./results", - metadata={ - "help": "the output directory" - }, + metadata={"help": "the output directory"}, ) log_freq: Optional[int] = field( default=1, - metadata={ - "help": "the logging frequency" - }, - ) - sanity_check: Optional[bool] = field( - default=False, - metadata={ - "help": "only train on 1000 samples" - } + metadata={"help": "the logging frequency"}, ) + sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"}) report_to: Optional[str] = field( default="wandb", metadata={ "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,' - '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. ' - 'Use `"all"` to report to all integrations installed, `"none"` for no integrations.' + '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. ' + 'Use `"all"` to report to all integrations installed, `"none"` for no integrations.' }, ) seed: Optional[int] = field( default=0, metadata={"help": "Random seed that will be set at the beginning of training."} ) - run_name: Optional[str] = field( - default="dpo", metadata={"help": "The name of the run."} - ) - eval_dataset_path: Optional[str] = field( - default=None, metadata={"help": "The path of the eval dataset."} - ) + run_name: Optional[str] = field(default="dpo", metadata={"help": "The name of the run."}) + eval_dataset_path: Optional[str] = field(default=None, metadata={"help": "The path of the eval dataset."}) @dataclass @@ -1390,11 +1203,11 @@ class DPOv2AlignerArguments(FinetunerArguments): """ The arguments for the DPOv2 training script. """ + # general args random_seed: Optional[int] = field(default=42, metadata={"help": "the random seed"}) accelerate_config_file: Optional[str] = field( - default=None, - metadata={"help": "file path for accelerate config file, only used in memory safe dpov2 align."} + default=None, metadata={"help": "file path for accelerate config file, only used in memory safe dpov2 align."} ) # pair sampling args margin_scale: Optional[float] = field(default=1.0, metadata={"help": "the margin scale"}) @@ -1414,15 +1227,13 @@ class IterativeAlignerArguments(InferencerArguments): """ Arguments for iterative aligners. """ - dataset_path_list: List[str] = field( - default_factory=list, - metadata={"help": "The list of dataset paths for iterative aligners."} + + dataset_path_list: list[str] = field( + default_factory=list, metadata={"help": "The list of dataset paths for iterative aligners."} ) initial_iter_idx: int = field( - default=0, - metadata={"help": "The initial iteration index, 0 refers to the first dataset in dataset_path_list."} + default=0, metadata={"help": "The initial iteration index, 0 refers to the first dataset in dataset_path_list."} ) - @dataclass @@ -1430,31 +1241,23 @@ class IterativeDPOAlignerArguments(IterativeAlignerArguments, DPOv2AlignerArgume """ Arguments for iterative DPO aligners. 
""" + output_dir: Optional[str] = field( default="./runs", metadata={"help": "Output path for the inferenced results"}, ) reward_model_inference_batch_size: int = field( - default=1, - metadata={"help": "The batch size for reward model inference."} + default=1, metadata={"help": "The batch size for reward model inference."} ) reward_model_inference_block_size: int = field( - default=2048, - metadata={"help": "The block size for reward model inference."} + default=2048, metadata={"help": "The block size for reward model inference."} ) do_response_generation: bool = field( - default=True, - metadata={"help": "Whether to generate responses using the model."} - ) - do_scoring: bool = field( - default=True, - metadata={"help": "Whether to score the responses using the reward model."} - ) - do_dpo_align: bool = field( - default=True, - metadata={"help": "Whether to perform DPO alignment."} + default=True, metadata={"help": "Whether to generate responses using the model."} ) - + do_scoring: bool = field(default=True, metadata={"help": "Whether to score the responses using the reward model."}) + do_dpo_align: bool = field(default=True, metadata={"help": "Whether to perform DPO alignment."}) + PIPELINE_ARGUMENT_MAPPING = { "finetuner": FinetunerArguments, diff --git a/src/lmflow/datasets/__init__.py b/src/lmflow/datasets/__init__.py index aa63cb48c..7bd8b2276 100644 --- a/src/lmflow/datasets/__init__.py +++ b/src/lmflow/datasets/__init__.py @@ -1,12 +1,16 @@ """This Python code defines a class Dataset with methods for initializing, loading, and manipulating datasets from different backends such as Hugging Face and JSON. - + The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging Face dataset, mapping datasets, and retrieving the backend dataset and arguments. """ -from lmflow.utils.versioning import is_multimodal_available - from lmflow.datasets.dataset import Dataset +from lmflow.utils.versioning import is_multimodal_available + if is_multimodal_available(): from lmflow.datasets.multi_modal_dataset import CustomMultiModalDataset + + __all__ = ["Dataset", "CustomMultiModalDataset"] +else: + __all__ = ["Dataset"] diff --git a/src/lmflow/datasets/dataset.py b/src/lmflow/datasets/dataset.py index f3537a411..014e52f4a 100644 --- a/src/lmflow/datasets/dataset.py +++ b/src/lmflow/datasets/dataset.py @@ -1,38 +1,30 @@ #!/usr/bin/env python -# coding=utf-8 """This Python code defines a class Dataset with methods for initializing, loading, and manipulating datasets from different backends such as Hugging Face and JSON. - + The `Dataset` class includes methods for loading datasets from a dictionary and a Hugging Face dataset, mapping datasets, and retrieving the backend dataset and arguments. 
""" - - # Importing necessary libraries and modules import copy import json import logging from pathlib import Path +from typing import Optional -from cmath import e -from pathlib import Path -from typing import Optional, List - -from datasets import load_dataset from datasets import Dataset as HFDataset +from datasets import load_dataset from tqdm import tqdm from lmflow.args import DatasetArguments from lmflow.utils.constants import ( DATASET_DESCRIPTION_MAP, - TEXT_ONLY_DATASET_DESCRIPTION, - TEXT2TEXT_DATASET_DESCRIPTION, - FLOAT_ONLY_DATASET_DESCRIPTION, INSTANCE_FIELDS_MAP, + TEXT_ONLY_DATASET_DESCRIPTION, ) +from lmflow.utils.data_utils import check_dataset_instances_key_fast, get_dataset_type_fast from lmflow.utils.versioning import is_multimodal_available -from lmflow.utils.data_utils import get_dataset_type_fast, check_dataset_instances_key_fast if is_multimodal_available(): from .multi_modal_dataset import CustomMultiModalDataset @@ -50,13 +42,14 @@ "paired_conversation", "paired_text_to_text", "text_to_textlist", - "text_to_scored_textlist" + "text_to_scored_textlist", ] KEY_TYPE = "type" KEY_INSTANCES = "instances" KEY_SCORE = "score" + class Dataset: r""" Initializes the Dataset object with the given parameters. @@ -68,34 +61,32 @@ class Dataset: backend : str, default="huggingface" A string representing the dataset backend. Defaults to "huggingface". - + args : Optional. Positional arguments. - + kwargs : Optional. Keyword arguments. """ - def __init__(self, data_args: DatasetArguments=None, backend: str="huggingface", *args, **kwargs): + + def __init__(self, data_args: DatasetArguments = None, backend: str = "huggingface", *args, **kwargs): self.data_args = data_args self.backend = backend self.backend_dataset = None - self.type = None # Original type of the dataset + self.type = None # Original type of the dataset self.dataset_path = data_args.dataset_path if data_args.dataset_path is None: return if backend == "huggingface": - data_files = [ - x.absolute().as_posix() - for x in Path(self.dataset_path).glob("*.json") - ] + data_files = [x.absolute().as_posix() for x in Path(self.dataset_path).glob("*.json")] logger.info(f"Data files: \n{data_files}") - + # check if the dataset is in the correct format and get the dataset type (text_only, text2text, etc.) self._check_hf_json_format(data_files) # Load the dataset using the HuggingFace dataset library - logger.info('Loading datasets') + logger.info("Loading datasets") extensions = "json" raw_dataset = load_dataset( extensions, @@ -112,48 +103,38 @@ def __init__(self, data_args: DatasetArguments=None, backend: str="huggingface", elif backend == "custom_multi_modal": # FIXME refactor the backend name if not is_multimodal_available(): - raise ValueError( - 'Multimodal not available. Please install via `pip install -e ".[multimodal]"`' - ) + raise ValueError('Multimodal not available. Please install via `pip install -e ".[multimodal]"`') raw_dataset = CustomMultiModalDataset(self.dataset_path, data_args) self.backend_dataset = raw_dataset else: raise NotImplementedError(f'Unsupported dataset backend "{backend}"') - def __len__(self): return len(self.backend_dataset) - def _check_instance_format(self): """ - Checks if data (instances) have required fields. + Checks if data (instances) have required fields. Raises messages with hints if not matched. 
""" fields = self.backend_dataset.features correct_fields = INSTANCE_FIELDS_MAP[self.type] if not set(correct_fields).issubset(set(fields)): - raise ValueError( - f'data instance fields incorrect' - f' {list(correct_fields)} are required.' - ) - - - def _check_hf_json_format(self, data_files: List[str]): - for single_file in tqdm(data_files, desc='Checking dataset keys'): + raise ValueError(f"data instance fields incorrect {list(correct_fields)} are required.") + + def _check_hf_json_format(self, data_files: list[str]): + for single_file in tqdm(data_files, desc="Checking dataset keys"): # get type and check if it is consistent json_data_type = get_dataset_type_fast(single_file) if not json_data_type: raise ValueError( - f'"{KEY_TYPE}" must be provided to initialize a dataset,' - f' e.g.\n' - f' {TEXT_ONLY_DATASET_DESCRIPTION}' + f'"{KEY_TYPE}" must be provided to initialize a dataset, e.g.\n {TEXT_ONLY_DATASET_DESCRIPTION}' ) if self.type is None: self.type = json_data_type elif self.type != json_data_type: raise ValueError( - 'All task files must have same data types. Previous' + "All task files must have same data types. Previous" f' files have type "{self.type}", but in file' f' {single_file}, it has type "{self.type}".' ) @@ -162,11 +143,10 @@ def _check_hf_json_format(self, data_files: List[str]): if not key_instances_exists_flag: raise ValueError( f'"{KEY_INSTANCES}" must be provided to initialize a' - f' dataset, e.g.\n' - f' {TEXT_ONLY_DATASET_DESCRIPTION}' + f" dataset, e.g.\n" + f" {TEXT_ONLY_DATASET_DESCRIPTION}" ) - def from_dict(self, dict_obj: dict, *args, **kwargs): r""" Create a Dataset object from a dictionary. @@ -194,10 +174,10 @@ def from_dict(self, dict_obj: dict, *args, **kwargs): dict_obj : dict. A dictionary containing the dataset information. - + args : Optional. Positional arguments. - + kwargs : Optional. Keyword arguments. @@ -209,19 +189,17 @@ def from_dict(self, dict_obj: dict, *args, **kwargs): if self.backend == "huggingface": if KEY_TYPE not in dict_obj: raise ValueError( - f'"{KEY_TYPE}" must be provided to initialize a dataset,' - f' e.g.\n' - f' {TEXT_ONLY_DATASET_DESCRIPTION}' + f'"{KEY_TYPE}" must be provided to initialize a dataset, e.g.\n {TEXT_ONLY_DATASET_DESCRIPTION}' ) if KEY_INSTANCES not in dict_obj: raise ValueError( f'"{KEY_INSTANCES}" must be provided to initialize a' - f' dataset, e.g.\n' - f' {TEXT_ONLY_DATASET_DESCRIPTION}' + f" dataset, e.g.\n" + f" {TEXT_ONLY_DATASET_DESCRIPTION}" ) self.type = dict_obj[KEY_TYPE] - if not self.type in INSTANCE_FIELDS_MAP: + if self.type not in INSTANCE_FIELDS_MAP: raise ValueError(f'type "{self.type}" is not supported') correct_fields = INSTANCE_FIELDS_MAP[self.type] @@ -229,27 +207,23 @@ def from_dict(self, dict_obj: dict, *args, **kwargs): for i, instance in enumerate(dict_obj[KEY_INSTANCES]): fields = instance.keys() if not set(correct_fields).issubset(set(fields)): - raise ValueError( - f'data instance fields incorrect' - f' {list(correct_fields)} are required.' - ) + raise ValueError(f"data instance fields incorrect {list(correct_fields)} are required.") try: hf_dict = {} if len(dict_obj[KEY_INSTANCES]) > 0: for key in dict_obj[KEY_INSTANCES][0].keys(): - hf_dict[key] = [ - instance[key] for instance in dict_obj[KEY_INSTANCES] - ] + hf_dict[key] = [instance[key] for instance in dict_obj[KEY_INSTANCES]] self.backend_dataset = HFDataset.from_dict(hf_dict, *args, **kwargs) except AttributeError as ex: raise ValueError( f"Error occurs: {ex}. 
Failed to convert dict to" - f" \"{self.type}\" dataset," f" the standard format is as" + f' "{self.type}" dataset,' + f" the standard format is as" f" follows:\n" f" {DATASET_DESCRIPTION_MAP[self.type]}" - ) + ) from ex self._check_instance_format() return self @@ -258,10 +232,7 @@ def from_dict(self, dict_obj: dict, *args, **kwargs): self.type = dict_obj[KEY_TYPE] return self else: - raise NotImplementedError( - f'Currently .from_dict is not supported for backend "{self.backend}"' - ) - + raise NotImplementedError(f'Currently .from_dict is not supported for backend "{self.backend}"') @classmethod def create_from_dict(cls, dict_obj, *args, **kwargs): @@ -275,7 +246,6 @@ def create_from_dict(cls, dict_obj, *args, **kwargs): dataset = Dataset(empty_data_args) return dataset.from_dict(dict_obj) - def to_dict(self): r""" Returns @@ -315,10 +285,7 @@ def to_dict(self): if first_key is not None: num_instances = len(hf_dict[first_key]) dict_obj[KEY_INSTANCES] = [ - { - key: hf_dict[key][i] for key in hf_dict.keys() - } - for i in range(num_instances) + {key: hf_dict[key][i] for key in hf_dict.keys()} for i in range(num_instances) ] return dict_obj @@ -326,26 +293,19 @@ def to_dict(self): dict_obj = self.backend_dataset return dict_obj else: - raise NotImplementedError( - f'Current .to_dict is not supported for backend "{self.backend}"' - ) - + raise NotImplementedError(f'Current .to_dict is not supported for backend "{self.backend}"') def to_list(self): """Returns a list of instances.""" if self.backend == "huggingface": - instance_list = [self.backend_dataset.__getitem__(idx) - for idx in range(len(self.backend_dataset))] + instance_list = [self.backend_dataset.__getitem__(idx) for idx in range(len(self.backend_dataset))] return instance_list elif self.backend == "dict": instance_list = copy.deepcopy(self.backend_dataset[KEY_INSTANCES]) # TODO: should be a list of instances, instance should be huggingface datasets row format return instance_list else: - raise NotImplementedError( - f'Current .to_list is not supported for backend "{self.backend}"' - ) - + raise NotImplementedError(f'Current .to_list is not supported for backend "{self.backend}"') def map(self, *args, **kwargs): r""" @@ -353,7 +313,7 @@ def map(self, *args, **kwargs): ------------ args : Optional. Positional arguments. - + kwargs : Optional. Keyword arguments. @@ -362,7 +322,7 @@ def map(self, *args, **kwargs): self : Dataset object. 
""" - # If the dataset uses Hugging Face as the backend, + # If the dataset uses Hugging Face as the backend, # call the `map()` function of the Hugging Face backend dataset if self.backend == "huggingface": # Set the mapped dataset as the backend dataset of the current dataset @@ -371,10 +331,7 @@ def map(self, *args, **kwargs): return self else: # If the backend is not Hugging Face, raise a NotImplementedError - raise NotImplementedError( - f'Currently .map is not supported for backend "{self.backend}"' - ) - + raise NotImplementedError(f'Currently .map is not supported for backend "{self.backend}"') def get_backend(self) -> Optional[str]: r""" @@ -385,7 +342,6 @@ def get_backend(self) -> Optional[str]: """ return self.backend - def get_backend_dataset(self): r""" Returns @@ -395,7 +351,6 @@ def get_backend_dataset(self): """ return self.backend_dataset - def get_fingerprint(self): r""" Returns @@ -405,7 +360,6 @@ def get_fingerprint(self): """ return self.backend_dataset._fingerprint - def get_data_args(self): r""" Returns @@ -415,7 +369,6 @@ def get_data_args(self): """ return self.data_args - def get_type(self) -> str: r""" Returns @@ -424,13 +377,8 @@ def get_type(self) -> str: self.type """ return self.type - - - def save( - self, - file_path: str, - format: str="json" - ): + + def save(self, file_path: str, format: str = "json"): r""" Save the dataset to a json file. @@ -441,14 +389,13 @@ def save( """ if format == "json": assert Path(file_path).suffix == ".json", "The file path must have a .json extension." - with open(file_path, "w", encoding='utf-8') as fout: + with open(file_path, "w", encoding="utf-8") as fout: json.dump(self.to_dict(), fout, indent=4, ensure_ascii=False) - + else: logger.error(f"Unsupported format when saving the dataset: {format}.") - - - def sample(self, n: int, seed: int=42): + + def sample(self, n: int, seed: int = 42): r""" Sample n instances from the dataset. @@ -469,20 +416,16 @@ def sample(self, n: int, seed: int=42): { "type": self.get_type(), "instances": [ - { - col_name: sampled_dataset[col_name][i] for col_name in sampled_dataset.column_names - } for i in range(n) - ] + {col_name: sampled_dataset[col_name][i] for col_name in sampled_dataset.column_names} + for i in range(n) + ], } ) return output_dataset else: - raise NotImplementedError( - f'Currently .sample is not supported for backend "{self.backend}"' - ) - - - def train_test_split(self, test_size: float=0.2, shuffle: bool=True, seed: int=42): + raise NotImplementedError(f'Currently .sample is not supported for backend "{self.backend}"') + + def train_test_split(self, test_size: float = 0.2, shuffle: bool = True, seed: int = 42): r""" Split the dataset into training and testing sets. @@ -496,41 +439,34 @@ def train_test_split(self, test_size: float=0.2, shuffle: bool=True, seed: int=4 train_dataset : Dataset object. A new dataset object containing the training instances. - + test_dataset : Dataset object. A new dataset object containing the testing instances. 
""" if self.backend == "huggingface": - splited = self.backend_dataset.train_test_split( - test_size=test_size, shuffle=shuffle, seed=seed - ) + splited = self.backend_dataset.train_test_split(test_size=test_size, shuffle=shuffle, seed=seed) train_dataset = self.create_from_dict( { "type": self.get_type(), "instances": [ - { - col_name: splited["train"][col_name][i] for col_name in splited["train"].column_names - } for i in range(len(splited["train"])) - ] + {col_name: splited["train"][col_name][i] for col_name in splited["train"].column_names} + for i in range(len(splited["train"])) + ], } ) test_dataset = self.create_from_dict( { "type": self.get_type(), "instances": [ - { - col_name: splited["test"][col_name][i] for col_name in splited["test"].column_names - } for i in range(len(splited["test"])) - ] + {col_name: splited["test"][col_name][i] for col_name in splited["test"].column_names} + for i in range(len(splited["test"])) + ], } ) return train_dataset, test_dataset else: - raise NotImplementedError( - f'Currently .train_test_split is not supported for backend "{self.backend}"' - ) - - + raise NotImplementedError(f'Currently .train_test_split is not supported for backend "{self.backend}"') + def drop_instances(self, indices: list): r""" Drop instances from the dataset. @@ -543,14 +479,11 @@ def drop_instances(self, indices: list): if self.backend == "huggingface": self.backend_dataset = self.backend_dataset.remove_indices(indices) else: - raise NotImplementedError( - f'Currently .drop_instances is not supported for backend "{self.backend}"' - ) - - + raise NotImplementedError(f'Currently .drop_instances is not supported for backend "{self.backend}"') + def sanity_check( - self, - drop_invalid: bool=True, + self, + drop_invalid: bool = True, ): r""" Perform a sanity check on the dataset. @@ -558,14 +491,11 @@ def sanity_check( if self.backend == "huggingface": self.hf_dataset_sanity_check(drop_invalid) else: - raise NotImplementedError( - f'Currently .sanity_check is not supported for backend "{self.backend}"' - ) - - + raise NotImplementedError(f'Currently .sanity_check is not supported for backend "{self.backend}"') + def hf_dataset_sanity_check( self, - drop_invalid: bool=True, + drop_invalid: bool = True, ): r""" Perform a sanity check on the HuggingFace dataset. @@ -573,25 +503,26 @@ def hf_dataset_sanity_check( if self.backend_dataset is None or len(self.backend_dataset) == 0: raise ValueError("Dataset is empty.") - if self.type == 'text_to_textlist': - num_output_per_instance = len(self.backend_dataset['output'][0]) - dataset_cache = self.backend_dataset.filter(lambda x: len(x['input'])!=0) - dataset_cache = self.backend_dataset.filter(lambda x: len(x['output']) == num_output_per_instance) - dataset_cache = self.backend_dataset.filter(lambda x: not all([len(output) == 0 for output in x['output']])) - + if self.type == "text_to_textlist": + num_output_per_instance = len(self.backend_dataset["output"][0]) + dataset_cache = self.backend_dataset.filter(lambda x: len(x["input"]) != 0) + dataset_cache = self.backend_dataset.filter(lambda x: len(x["output"]) == num_output_per_instance) + dataset_cache = self.backend_dataset.filter(lambda x: not all([len(output) == 0 for output in x["output"]])) + if len(dataset_cache) != len(self.backend_dataset): warning_info = ( f"Found {len(self.backend_dataset) - len(dataset_cache)} invalid instances " "during hf_dataset_sanity_check, please check:\n" " 1. length of input strings should not be empty\n" " 2. 
length of output strings should not be all empty\n" - " 3. number of output strings should be consistent\n" # since we will use tensor reshape later + " 3. number of output strings should be consistent\n" # since we will use tensor reshape later ) if drop_invalid: self.backend_dataset = dataset_cache - logger.warning(warning_info+"Invalid instances are dropped.") + logger.warning(warning_info) + logger.warning("Invalid instances are dropped.") else: raise ValueError(warning_info) - + else: - logger.warning(f"No sanity check for {self.type} dataset.") \ No newline at end of file + logger.warning(f"No sanity check for {self.type} dataset.") diff --git a/src/lmflow/datasets/multi_modal_dataset.py b/src/lmflow/datasets/multi_modal_dataset.py index 5b204443c..053ec957c 100644 --- a/src/lmflow/datasets/multi_modal_dataset.py +++ b/src/lmflow/datasets/multi_modal_dataset.py @@ -1,29 +1,34 @@ #!/usr/bin/env python -# coding=utf-8 # FIXME update the doc string. -"""This Python code defines a class Multi Modal Dataset. -""" +"""This Python code defines a class Multi Modal Dataset.""" + import copy -from dataclasses import dataclass, field import json -from PIL import Image import os.path as osp -import transformers +from dataclasses import dataclass + import torch +import transformers +from PIL import Image from torch.utils.data import Dataset from lmflow.args import DatasetArguments from lmflow.utils import llava_conversation_lib as conversation_lib +from lmflow.utils.constants import ( + DEFAULT_IM_END_TOKEN, + DEFAULT_IM_START_TOKEN, + DEFAULT_IMAGE_TOKEN, + IGNORE_INDEX, + IMAGE_TOKEN_INDEX, +) -from lmflow.utils.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN class CustomMultiModalDataset(Dataset): """Dataset for Multi Modal data""" - def __init__(self, dataset_path: str, - data_args: DatasetArguments): - super(CustomMultiModalDataset, self).__init__() - data_dict = json.load(open(dataset_path, "r")) + def __init__(self, dataset_path: str, data_args: DatasetArguments): + super().__init__() + data_dict = json.load(open(dataset_path)) self.data_dict = data_dict print("Finish loading json file in dataset.") self.data_args = data_args @@ -34,8 +39,7 @@ def __len__(self): def register_tokenizer(self, tokenizer, image_processor=None): self.tokenizer = tokenizer - self.image_processor = getattr( - tokenizer, "image_processor", image_processor) + self.image_processor = getattr(tokenizer, "image_processor", image_processor) def __getitem__(self, i): data = self.data_dict[i] @@ -43,11 +47,11 @@ def __getitem__(self, i): data = [data] assert len(data) == 1 processor = self.image_processor - if 'image' in data[0]: - image_file = data[0]['image'] - image = Image.open( - osp.join(self.image_folder, image_file)).convert("RGB") - if self.data_args.image_aspect_ratio == 'pad': + if "image" in data[0]: + image_file = data[0]["image"] + image = Image.open(osp.join(self.image_folder, image_file)).convert("RGB") + if self.data_args.image_aspect_ratio == "pad": + def expand2square(pil_img, background_color): width, height = pil_img.size if width == height: @@ -59,44 +63,34 @@ def expand2square(pil_img, background_color): else: result = Image.new(pil_img.mode, (height, height), background_color) result.paste(pil_img, ((height - width) // 2, 0)) - return result - image = expand2square(image, tuple(int(x*255) for x in processor.image_mean)) - image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] + return result + + image = 
expand2square(image, tuple(int(x * 255) for x in processor.image_mean)) + image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] else: - image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0] - data = preprocess_multimodal_llava( - copy.deepcopy([e["conversations"] for e in data]), - self.data_args) + image = processor.preprocess(image, return_tensors="pt")["pixel_values"][0] + data = preprocess_multimodal_llava(copy.deepcopy([e["conversations"] for e in data]), self.data_args) else: data = copy.deepcopy([e["conversations"] for e in data]) if self.data_args.sep_style == "plain": data_dict = preprocess_llama_from_llava_plain( - data, - self.tokenizer, - has_image=('image' in self.data_dict[i]) + data, self.tokenizer, has_image=("image" in self.data_dict[i]) ) else: - data_dict = preprocess_llama_from_llava_v1( - data, - self.tokenizer, - has_image=('image' in self.data_dict[i]) - ) + data_dict = preprocess_llama_from_llava_v1(data, self.tokenizer, has_image=("image" in self.data_dict[i])) if isinstance(i, int): - data_dict = dict(input_ids=data_dict["input_ids"][0], - labels=data_dict["labels"][0]) - + data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0]) + # image exist in the data - if 'image' in self.data_dict[i]: - data_dict['image'] = image + if "image" in self.data_dict[i]: + data_dict["image"] = image else: # image does not exist in the data, but the model is multimodal crop_size = self.image_processor.crop_size - data_dict['image'] = torch.zeros( - 3, crop_size['height'], crop_size['width']) + data_dict["image"] = torch.zeros(3, crop_size["height"], crop_size["width"]) return data_dict - def preprocess_multimodal_llava(sources, data_args): is_multimodal = data_args.is_multimodal if not is_multimodal: @@ -104,31 +98,26 @@ def preprocess_multimodal_llava(sources, data_args): for source in sources: for sentence in source: - if DEFAULT_IMAGE_TOKEN in sentence['value']: - sentence['value'] = sentence['value'].replace( - DEFAULT_IMAGE_TOKEN, '').strip() - sentence['value'] = DEFAULT_IMAGE_TOKEN + '\n' + sentence['value'] - sentence['value'] = sentence['value'].strip() + if DEFAULT_IMAGE_TOKEN in sentence["value"]: + sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, "").strip() + sentence["value"] = DEFAULT_IMAGE_TOKEN + "\n" + sentence["value"] + sentence["value"] = sentence["value"].strip() if "mmtag" in conversation_lib.default_conversation.version: - sentence['value'] = sentence['value'].replace( - DEFAULT_IMAGE_TOKEN, '' + DEFAULT_IMAGE_TOKEN + '') + sentence["value"] = sentence["value"].replace( + DEFAULT_IMAGE_TOKEN, "" + DEFAULT_IMAGE_TOKEN + "" + ) replace_token = DEFAULT_IMAGE_TOKEN if data_args.use_image_start_end: - replace_token = DEFAULT_IM_START_TOKEN + \ - replace_token + DEFAULT_IM_END_TOKEN - sentence["value"] = sentence["value"].replace( - DEFAULT_IMAGE_TOKEN, replace_token) + replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN + sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token) return sources -def tokenizer_image_token(prompt, - tokenizer, - image_token_index=IMAGE_TOKEN_INDEX, - return_tensors=None): - prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('')] +def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None): + prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] def insert_separator(X, sep): - return [ele for sublist in zip(X, 
[sep]*len(X)) for ele in sublist][:-1] + return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] input_ids = [] offset = 0 @@ -140,16 +129,13 @@ def insert_separator(X, sep): input_ids.extend(x[offset:]) if return_tensors is not None: - if return_tensors == 'pt': + if return_tensors == "pt": return torch.tensor(input_ids, dtype=torch.long) - raise ValueError(f'Unsupported tensor type: {return_tensors}') + raise ValueError(f"Unsupported tensor type: {return_tensors}") return input_ids -def preprocess_llama_from_llava_plain( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False): +def preprocess_llama_from_llava_plain(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False): """ This function just add the image in the front of text. And don't add any prompt. @@ -163,23 +149,21 @@ def preprocess_llama_from_llava_plain( conversations = [] for source in sources: assert len(source) == 2 - assert DEFAULT_IMAGE_TOKEN in source[0]['value'] - source[0]['value'] = DEFAULT_IMAGE_TOKEN - conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep + assert DEFAULT_IMAGE_TOKEN in source[0]["value"] + source[0]["value"] = DEFAULT_IMAGE_TOKEN + conversation = source[0]["value"] + source[1]["value"] + conversation_lib.default_conversation.sep conversations.append(conversation) # tokenize conversations - input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations] + input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations] targets = copy.deepcopy(input_ids) for target, source in zip(targets, sources): - tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer)) + tokenized_len = len(tokenizer_image_token(source[0]["value"], tokenizer)) target[:tokenized_len] = IGNORE_INDEX return dict(input_ids=input_ids, labels=targets) -def preprocess_llama_from_llava_v1( - sources, - tokenizer: transformers.PreTrainedTokenizer, - has_image: bool = False): + +def preprocess_llama_from_llava_v1(sources, tokenizer: transformers.PreTrainedTokenizer, has_image: bool = False): """ This function add the prompt and then put the image after the prompt. So it needs additional code to generate the target label. @@ -211,7 +195,9 @@ def preprocess_llama_from_llava_v1( # Tokenize conversations if has_image: - input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0) + input_ids = torch.stack( + [tokenizer_image_token(prompt, tokenizer, return_tensors="pt") for prompt in conversations], dim=0 + ) else: input_ids = tokenizer( conversations, @@ -256,10 +242,7 @@ def preprocess_llama_from_llava_v1( if cur_len < tokenizer.model_max_length: if cur_len != total_len: target[:] = IGNORE_INDEX - print( - f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}." - f" (ignored)" - ) + print(f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}. 
(ignored)") return dict( input_ids=input_ids, @@ -268,33 +251,29 @@ def preprocess_llama_from_llava_v1( @dataclass -class DataCollatorForSupervisedDataset(object): +class DataCollatorForSupervisedDataset: """Collate examples for supervised fine-tuning.""" tokenizer: transformers.PreTrainedTokenizer def __call__(self, instances): - input_ids, labels = tuple([instance[key] for instance in instances] - for key in ("input_ids", "labels")) + input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels")) input_ids = torch.nn.utils.rnn.pad_sequence( - input_ids, - batch_first=True, - padding_value=self.tokenizer.pad_token_id) - labels = torch.nn.utils.rnn.pad_sequence(labels, - batch_first=True, - padding_value=IGNORE_INDEX) - input_ids = input_ids[:, :self.tokenizer.model_max_length] - labels = labels[:, :self.tokenizer.model_max_length] + input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id + ) + labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) + input_ids = input_ids[:, : self.tokenizer.model_max_length] + labels = labels[:, : self.tokenizer.model_max_length] batch = dict( input_ids=input_ids, labels=labels, attention_mask=input_ids.ne(self.tokenizer.pad_token_id), ) - if 'image' in instances[0]: - images = [instance['image'] for instance in instances] + if "image" in instances[0]: + images = [instance["image"] for instance in instances] if all(x is not None and x.shape == images[0].shape for x in images): - batch['images'] = torch.stack(images) + batch["images"] = torch.stack(images) else: - batch['images'] = images + batch["images"] = images return batch diff --git a/src/lmflow/models/auto_model.py b/src/lmflow/models/auto_model.py index c21790f57..74d199f0a 100644 --- a/src/lmflow/models/auto_model.py +++ b/src/lmflow/models/auto_model.py @@ -1,14 +1,13 @@ #!/usr/bin/env python -# coding=utf-8 -"""Automatically get correct model type. -""" +"""Automatically get correct model type.""" from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.models.hf_text_regression_model import HFTextRegressionModel -from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel -class AutoModel: +# from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel + +class AutoModel: @classmethod def get_model(self, model_args, *args, **kwargs): arch_type = model_args.arch_type @@ -16,10 +15,8 @@ def get_model(self, model_args, *args, **kwargs): return HFDecoderModel(model_args, *args, **kwargs) elif arch_type == "text_regression": return HFTextRegressionModel(model_args, *args, **kwargs) - elif arch_type == "encoder_decoder" or \ - arch_type == "vision_encoder_decoder": - return HFEncoderDecoderModel(model_args, *args, **kwargs) + # elif arch_type == "encoder_decoder" or \ + # arch_type == "vision_encoder_decoder": + # return HFEncoderDecoderModel(model_args, *args, **kwargs) else: - raise NotImplementedError( - f"model architecture type \"{arch_type}\" is not supported" - ) + raise NotImplementedError(f'model architecture type "{arch_type}" is not supported') diff --git a/src/lmflow/models/base_model.py b/src/lmflow/models/base_model.py index 335dbe963..a42bdf576 100644 --- a/src/lmflow/models/base_model.py +++ b/src/lmflow/models/base_model.py @@ -1,12 +1,6 @@ #!/usr/bin/env python -# coding=utf-8 -"""Base model class. 
-""" +"""Base model class.""" -from abc import ABC - -class BaseModel(ABC): - - def __init__(self, *args, **kwargs): - pass +class BaseModel: + pass diff --git a/src/lmflow/models/decoder_model.py b/src/lmflow/models/decoder_model.py index 19f3c41eb..5593ae28c 100644 --- a/src/lmflow/models/decoder_model.py +++ b/src/lmflow/models/decoder_model.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 """A one-line summary of the module or program, terminated by a period. Leave one blank line. The rest of this docstring should contain an @@ -17,6 +16,5 @@ class DecoderModel(BaseModel): - def __init__(self, *args, **kwargs): pass diff --git a/src/lmflow/models/encoder_decoder_model.py b/src/lmflow/models/encoder_decoder_model.py index 5ccc29c38..f911fa782 100644 --- a/src/lmflow/models/encoder_decoder_model.py +++ b/src/lmflow/models/encoder_decoder_model.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# coding=utf-8 """A one-line summary of the module or program, terminated by a period. Leave one blank line. The rest of this docstring should contain an @@ -17,6 +16,5 @@ class EncoderDecoderModel(BaseModel): - def __init__(self, *args, **kwargs): - pass \ No newline at end of file + pass diff --git a/src/lmflow/models/hf_decoder_model.py b/src/lmflow/models/hf_decoder_model.py index 0fd6da2f0..25b4aae9f 100644 --- a/src/lmflow/models/hf_decoder_model.py +++ b/src/lmflow/models/hf_decoder_model.py @@ -1,18 +1,13 @@ #!/usr/bin/env python -# coding=utf-8 """This is a class called HFDecoderModel which is a wrapper around transformers model and -tokenizer classes. It has several methods such as __init__, tokenize, and train that are +tokenizer classes. It has several methods such as __init__, tokenize, and train that are used for training and fine-tuning the model. The __init__ method takes in several arguments -such as model_args, tune_strategy, and ds_config, which are used to load the pretrained +such as model_args which are used to load the pretrained model and tokenizer, and initialize the training settings. The tokenize method is used to tokenize the input text and return the input IDs and attention masks that can be fed to the model for training or inference. -This class supports different tune_strategy options such as 'normal', 'none', 'lora', and -'adapter', which allow for different fine-tuning settings of the model. However, the 'lora' -and 'adapter' strategies are not yet implemented. - Overall, this class provides a convenient interface for loading and fine-tuning transformer models and can be used for various NLP tasks such as language modeling, text classification, and question answering. 
@@ -20,45 +15,42 @@ import hashlib import logging -import os, shutil -from typing import List, Union, Optional, Dict +import os +from typing import Optional, Union import torch +from peft import PeftModel from transformers import ( AutoConfig, AutoModelForCausalLM, ) -from peft import PeftModel from lmflow.datasets.dataset import Dataset -from lmflow.models.hf_model_mixin import HFModelMixin from lmflow.models.decoder_model import DecoderModel +from lmflow.models.hf_model_mixin import HFModelMixin from lmflow.models.interfaces.tunable import Tunable +from lmflow.tokenization.hf_decoder_model import conversation_tokenize_function, tokenize_function from lmflow.utils.constants import ( - TEXT_ONLY_DATASET_DESCRIPTION, - TEXT2TEXT_DATASET_DESCRIPTION, CONVERSATION_DATASET_DESCRIPTION, + TEXT2TEXT_DATASET_DESCRIPTION, + TEXT_ONLY_DATASET_DESCRIPTION, ) from lmflow.utils.conversation_template import PRESET_TEMPLATES from lmflow.utils.data_utils import VLLMInferenceResultWithInput -from lmflow.tokenization.hf_decoder_model import ( - tokenize_function, - conversation_tokenize_function -) -from lmflow.utils.versioning import is_ray_available, is_vllm_available, is_flash_attn_available - +from lmflow.utils.envs import is_accelerate_env +from lmflow.utils.versioning import is_flash_attn_available, is_ray_available, is_vllm_available logger = logging.getLogger(__name__) if is_flash_attn_available(): - import flash_attn + pass else: logger.warning("Consider install flash_attn for better performance.") - + if is_vllm_available(): from vllm import SamplingParams - + if is_ray_available(): import ray import ray.data @@ -71,70 +63,37 @@ class HFDecoderModel(DecoderModel, HFModelMixin, Tunable): Parameters ------------ - model_args : + model_args : Model arguments such as model name, path, revision, etc. - tune_strategy : str or none, default="normal". - A string representing the dataset backend. Defaults to "huggingface". - - ds_config : - Deepspeed configuations. - + do_train : bool, default True + Determines whether to prepare the model for training, including distribtued env, model placement, quantization, + lora, etc. + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. + Keyword arguments. """ - def __init__( - self, - model_args, - tune_strategy='normal', - ds_config=None, - device="gpu", - use_accelerator=False, - *args, - **kwargs - ): - """ - Initializes a HFDecoderModel instance. - :param model_args: dictionary with model arguments such as model name, path, revision, etc. - :param tune_strategy: tuning strategy: normal, none, lora or adapter - :param ds_config: deepspeed configuration for distributed training - """ - HFModelMixin.__init__( - self, - model_args=model_args, - do_train=True if tune_strategy == "normal" else False, - ds_config=ds_config, - device=device, - use_accelerator=use_accelerator, - *args, - **kwargs - ) - + def __init__(self, model_args, do_train=True, device="gpu", **kwargs): + HFModelMixin.__init__(self, model_args=model_args, do_train=do_train, device=device, **kwargs) - def tokenize( - self, - dataset: Dataset, - add_special_tokens=True, - *args, - **kwargs - ) -> Dataset: + def tokenize(self, dataset: Dataset, add_special_tokens=True, *args, **kwargs) -> Dataset: """ Tokenize the full dataset. - + Parameters ------------ dataset : lmflow.datasets.Dataset. args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. 
+ Returns ------------ tokenized_datasets : @@ -145,10 +104,7 @@ def tokenize( # Preprocessing the datasets. # First we tokenize all the texts. if dataset.get_backend() != "huggingface": - raise NotImplementedError( - "tokenization of datasets with non-huggingface backend are" - "not supported yet" - ) + raise NotImplementedError("tokenization of datasets with non-huggingface backend arenot supported yet") dataset_type = dataset.get_type() model_args = self.model_args @@ -167,8 +123,8 @@ def tokenize( # 3) Which fields require loss in final computation, e.g. # "text_only": "text" # "text2text": "output" only - tokenized_column_order = None # Handles 1) and 2) - label_columns = None # Handles 3) + tokenized_column_order = None # Handles 1) and 2) + label_columns = None # Handles 3) if dataset_type == "text_only": tokenized_column_order = ["text"] label_columns = ["text"] @@ -186,12 +142,12 @@ def tokenize( ) else: logger.warning("No conversation template provided. Using default template.") - conversation_template = PRESET_TEMPLATES['empty'] - + conversation_template = PRESET_TEMPLATES["empty"] + logger.warning(f"Conversation template: {conversation_template}") else: raise NotImplementedError( - f"dataset type \"{dataset_type}\" is not supported, currently" + f'dataset type "{dataset_type}" is not supported, currently' " only support following data types:\n" f" 1) {TEXT_ONLY_DATASET_DESCRIPTION}\n" f" 2) {TEXT2TEXT_DATASET_DESCRIPTION}\n" @@ -202,7 +158,7 @@ def tokenize( use_truncation = False if model_args.use_lora or data_args.disable_group_texts: use_truncation = True - + tokenize_fn = conversation_tokenize_function if "conversation" in dataset_type else tokenize_function tokenize_fn_kwargs = { "data_args": data_args, @@ -216,17 +172,21 @@ def tokenize( tokenize_fn_kwargs["tokenized_column_order"] = tokenized_column_order tokenize_fn_kwargs["add_special_tokens"] = add_special_tokens tokenize_fn_kwargs["use_truncation"] = use_truncation - + tokenize_kwargs = {} if not data_args.streaming: fingerprint = hashlib.md5( ( raw_datasets.get_fingerprint() + str(self.tokenizer) - + f'###padding_side={self.tokenizer.padding_side}' - + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "") - + f'###disable_group_texts={data_args.disable_group_texts}' - + f'###block_size={data_args.block_size}' + + f"###padding_side={self.tokenizer.padding_side}" + + ( + "###conversation_template=" + str(conversation_template) + if "conversation" in dataset_type + else "" + ) + + f"###disable_group_texts={data_args.disable_group_texts}" + + f"###block_size={data_args.block_size}" ).encode("utf-8") ).hexdigest() tokenize_kwargs = { @@ -242,62 +202,60 @@ def tokenize( "use block_size for maximum tokenized sequence length." ) tokenized_datasets = raw_datasets.map( - tokenize_fn, - batched=True, - remove_columns=column_names, - fn_kwargs=tokenize_fn_kwargs, - **tokenize_kwargs + tokenize_fn, batched=True, remove_columns=column_names, fn_kwargs=tokenize_fn_kwargs, **tokenize_kwargs ) return tokenized_datasets - - def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]: + def encode(self, input: Union[str, list[str]], *args, **kwargs) -> Union[list[int], list[list[int]]]: """ Perform encoding process of the tokenizer. - + Parameters ------------ inputs : str or list. The text sequence. - + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. 
+ Returns ------------ outputs : if string input,return the tokenized inputs. "Hello,world!"-> [101, 7592, 1010, 2088, 102] if batch input,return {input_ids,attention_mask,token_type_ids} - ["Hello,world!","Hello!"]-> {'input_ids': tensor([[ 101, 7592, 1010, 2088, 102],...),'attention_mask': tensor([[1, 1, 1, 1, 1],[0,0,1,1,1]])} + ["Hello,world!","Hello!"] -> + { + 'input_ids': tensor([[ 101, 7592, 1010, 2088, 102],...), + 'attention_mask': tensor([[1, 1, 1, 1, 1],[0,0,1,1,1]]) + } """ if isinstance(input, list): - return self.tokenizer(text=input, *args, **kwargs)#batch encode,will automatically do left padding + return self.tokenizer(text=input, **kwargs) # batch encode,will automatically do left padding elif isinstance(input, str): - return self.tokenizer.encode(text=input, *args, **kwargs) + return self.tokenizer.encode(text=input, **kwargs) else: raise NotImplementedError(f'type "{type(input)}" cannot be encoded') - - def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]: + def decode(self, input, **kwargs) -> Union[str, list[str]]: """ Perform decoding process of the tokenizer. - + Parameters ------------ inputs : list or tensor. The token sequence. - + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ outputs : @@ -307,25 +265,18 @@ def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]: if single input,return the text [101, 7592, 1010, 2088, 102]-> "Hello,world!" """ - if isinstance(input, List): - input=torch.tensor(input) - if input.dim()==2: - return self.tokenizer.batch_decode(input, *args, **kwargs)#batch_decode + if isinstance(input, list): + input = torch.tensor(input) + if input.dim() == 2: + return self.tokenizer.batch_decode(input, **kwargs) # batch_decode else: # Can be list of ints or a Tensor - return self.tokenizer.decode(input, *args, **kwargs) + return self.tokenizer.decode(input, **kwargs) - - def inference( - self, - inputs, - release_gpu: bool = False, - use_vllm: bool = False, - **kwargs - ): + def inference(self, inputs, release_gpu: bool = False, use_vllm: bool = False, **kwargs): """ Perform generation process of the model. - + Parameters ------------ inputs : @@ -337,109 +288,94 @@ def inference( use_vllm : bool, optional Whether to use VLLM for inference, by default False. kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ outputs : - The generated sequence output + The generated sequence output """ if not self._activated: self.activate_model_for_inference( use_vllm=use_vllm, **kwargs, ) - + if use_vllm: if not is_vllm_available(): raise ImportError("vllm is not installed. Please install vllm to use VLLM inference.") res = self.__vllm_inference(inputs, **kwargs) else: res = self.__inference(inputs, **kwargs) - + if release_gpu: self.deactivate_model_for_inference(use_vllm=use_vllm) - - return res + return res def __inference(self, inputs, *args, **kwargs): """ Perform generation process of the model. - + Parameters ------------ inputs : The **tokenized** sequence used as a prompt for the generation or as model inputs to the model. - + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. 
+ Returns ------------ outputs : - The generated sequence output + The generated sequence output """ with torch.no_grad(): - if self.use_accelerator: + if is_accelerate_env(): outputs = self.backend_model.generate( - input_ids=inputs, - pad_token_id=self.tokenizer.pad_token_id, - *args, - **kwargs + input_ids=inputs, pad_token_id=self.tokenizer.pad_token_id, **kwargs ) else: if self.device == "gpu": outputs = self.ds_engine.module.generate( - input_ids=inputs, - synced_gpus=True, - pad_token_id=self.tokenizer.pad_token_id, - *args, - **kwargs + input_ids=inputs, synced_gpus=True, pad_token_id=self.tokenizer.pad_token_id, **kwargs ) elif self.device == "cpu": outputs = self.backend_model.generate( - input_ids=inputs, - synced_gpus=True, - pad_token_id=self.tokenizer.pad_token_id, - *args, - **kwargs + input_ids=inputs, synced_gpus=True, pad_token_id=self.tokenizer.pad_token_id, **kwargs ) else: - raise NotImplementedError( - f"device \"{self.device}\" is not supported" - ) + raise NotImplementedError(f'device "{self.device}" is not supported') return outputs - - + def __vllm_inference( - self, - inputs: Union[str, List[str]], - sampling_params: Optional['SamplingParams'] = None, + self, + inputs: Union[str, list[str]], + sampling_params: Optional["SamplingParams"] = None, **kwargs, - ) -> List[VLLMInferenceResultWithInput]: + ) -> list[VLLMInferenceResultWithInput]: """Perform VLLM inference process of the model. Parameters ---------- - inputs : Union[str, List[str]] + inputs : Union[str, list[str]] Prompt(s), string or a list of strings. sampling_params : Optional[SamplingParams], optional vllm SamplingParams object, by default None. Returns ------- - List[VLLMInferenceResultWithInput] + list[VLLMInferenceResultWithInput] Return a list of VLLMInferenceResultWithInput, where each element contains the input prompt and the corresponding output. - + When `sampling_params.detokenize = True`, the output would be a list of strings, contains sampling_params.n samples for the corresponding prompt. - - When `sampling_params.detokenize = False`, return a list of list of ints + + When `sampling_params.detokenize = False`, return a list of list of ints (token ids, no decoding after generation). """ vllm_outputs = self.backend_model_for_inference.generate( @@ -450,15 +386,14 @@ def __vllm_inference( final_output = [] for output in vllm_outputs: if sampling_params.detokenize: - output_list = [sentence.text for sentence in output.outputs] + output_list = [sentence.text for sentence in output.outputs] else: output_list = [sentence.token_ids for sentence in output.outputs] - + final_output.append({"input": output.prompt, "output": output_list}) - + return final_output - - + def prepare_inputs_for_inference( self, dataset: Dataset, @@ -466,21 +401,21 @@ def prepare_inputs_for_inference( enable_distributed_inference: bool = False, use_vllm: bool = False, **kwargs, - ) -> Union[List[str], "ray.data.Dataset", Dict[str, torch.Tensor]]: + ) -> Union[list[str], "ray.data.Dataset", dict[str, torch.Tensor]]: """ Prepare inputs for inference. - + Parameters ------------ dataset : lmflow.datasets.Dataset. The dataset used for inference. - + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ outputs : @@ -490,7 +425,7 @@ def prepare_inputs_for_inference( if not is_ray_available() and enable_distributed_inference: raise ImportError("ray is not installed. 
Please install ray to use distributed vllm inference.") inference_inputs = self.__prepare_inputs_for_vllm_inference( - dataset=dataset, + dataset=dataset, apply_chat_template=apply_chat_template, enable_distributed_inference=enable_distributed_inference, ) @@ -500,93 +435,95 @@ def prepare_inputs_for_inference( apply_chat_template=apply_chat_template, enable_distributed_inference=enable_distributed_inference, ) - + return inference_inputs - - + def __prepare_inputs_for_vllm_inference( self, dataset: Dataset, apply_chat_template: bool = True, enable_distributed_inference: bool = False, - ) -> Union[List[str], "ray.data.Dataset"]: - if dataset.get_type() == 'text_only': + ) -> Union[list[str], "ray.data.Dataset"]: + if dataset.get_type() == "text_only": if apply_chat_template: dataset = dataset.map( lambda sample: { "templated": self.tokenizer.apply_chat_template( - [{"role":"user", "content": sample['text']}], - tokenize=False, - add_generation_prompt=True + [{"role": "user", "content": sample["text"]}], tokenize=False, add_generation_prompt=True ) }, num_proc=dataset.data_args.preprocessing_num_workers, ) - inference_inputs = dataset.get_backend_dataset()['templated'] + inference_inputs = dataset.get_backend_dataset()["templated"] else: - inference_inputs = dataset.get_backend_dataset()['text'] - + inference_inputs = dataset.get_backend_dataset()["text"] + elif dataset.get_type() == "text2text": - logger.warning(f"For a text2text dataset, only `input` will be used as the model input.") + logger.warning("For a text2text dataset, only `input` will be used as the model input.") if apply_chat_template: dataset = dataset.map( lambda sample: { "templated": self.tokenizer.apply_chat_template( - conversation=[{"role":"user", "content": sample['input']}], - tokenize=False, - add_generation_prompt=True + conversation=[{"role": "user", "content": sample["input"]}], + tokenize=False, + add_generation_prompt=True, ) }, num_proc=dataset.data_args.preprocessing_num_workers, ) - inference_inputs = dataset.get_backend_dataset()['templated'] + inference_inputs = dataset.get_backend_dataset()["templated"] else: - inference_inputs = dataset.get_backend_dataset()['input'] - - elif dataset.get_type() == 'conversation': + inference_inputs = dataset.get_backend_dataset()["input"] + + elif dataset.get_type() == "conversation": if apply_chat_template: + def preprocess_conversation(sample): - conversation = sample['messages'][:-1] if len(sample['messages'])%2 == 0 else sample['messages'] - - if sample['messages'][-1]['role'] != 'user': + conversation = sample["messages"][:-1] if len(sample["messages"]) % 2 == 0 else sample["messages"] + + if sample["messages"][-1]["role"] != "user": logger.warning( "Not a valid conversation for generation, since the conversation " "doesn't end up with an user message. Skip." ) sample_out = {"templated": ""} else: - sample_out = {"templated": self.tokenizer.apply_chat_template( - conversation=conversation, - tokenize=False, - add_generation_prompt=True, - )} - + sample_out = { + "templated": self.tokenizer.apply_chat_template( + conversation=conversation, + tokenize=False, + add_generation_prompt=True, + ) + } + return sample_out + dataset = dataset.map( preprocess_conversation, num_proc=dataset.data_args.preprocessing_num_workers, ) - inference_inputs = dataset.get_backend_dataset()['templated'] + inference_inputs = dataset.get_backend_dataset()["templated"] else: logger.warning( "Your dataset is `conversation` type but `apply_chat_template` is set to False. 
" "Will use the first user input in conversation as model input." ) - inference_inputs = [conversation[0]['content'] for conversation in dataset.get_backend_dataset()['messages']] + inference_inputs = [ + conversation[0]["content"] for conversation in dataset.get_backend_dataset()["messages"] + ] else: - raise NotImplementedError( - f"Currently `{dataset.get_type()}` data are not supported for vllm inference." - ) + raise NotImplementedError(f"Currently `{dataset.get_type()}` data are not supported for vllm inference.") inference_inputs = [sentence for sentence in inference_inputs if len(sentence) > 0] - + if enable_distributed_inference: - inference_inputs = ray.data.from_items(inference_inputs) # -> Dict[str, np.ndarray], {"item": array(['...', '...', '...'])} - + inference_inputs = ray.data.from_items( + inference_inputs + ) # -> dict[str, np.ndarray], {"item": array(['...', '...', '...'])} + return inference_inputs - - + def __prepare_inputs_for_inference( self, dataset: Dataset, @@ -594,13 +531,14 @@ def __prepare_inputs_for_inference( ): raise NotImplementedError("prepare_inputs_for_inference is not implemented") - def merge_lora_weights(self): if self.model_args.use_lora and not self.model_args.use_qlora: self.get_backend_model().merge_and_unload() elif self.model_args.use_qlora: - logger.warning("Reloading base model in 16-bit precision to merge adapter weights. NOTE: Your device must have" - "sufficient memory to reload the model in half-precision without quantization.") + logger.warning( + "Reloading base model in 16-bit precision to merge adapter weights. NOTE: Your device must have" + "sufficient memory to reload the model in half-precision without quantization." + ) self.get_peft_without_qlora() self.get_backend_model().merge_and_unload() else: @@ -610,8 +548,7 @@ def get_peft_without_qlora(self): import tempfile with tempfile.TemporaryDirectory() as tmpdirname: - print('created temporary directory', tmpdirname) - + print("created temporary directory", tmpdirname) self.get_backend_model().save_pretrained(tmpdirname) @@ -627,9 +564,9 @@ def get_peft_without_qlora(self): } config = AutoConfig.from_pretrained(self.model_args.model_name_or_path, **config_kwargs) device_map = "auto" - if os.environ.get('LOCAL_RANK') is not None: - local_rank = int(os.environ.get('LOCAL_RANK','0')) - device_map = {'': local_rank} + if os.environ.get("LOCAL_RANK") is not None: + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + device_map = {"": local_rank} self.backend_model_full = AutoModelForCausalLM.from_pretrained( self.model_args.model_name_or_path, @@ -640,31 +577,31 @@ def get_peft_without_qlora(self): token=self.model_args.token, torch_dtype=torch_dtype, device_map=device_map, - trust_remote_code = self.model_args.trust_remote_code, + trust_remote_code=self.model_args.trust_remote_code, attn_implementation="flash_attention_2" if self.model_args.use_flash_attention else None, ) - + self.backend_model = PeftModel.from_pretrained(self.backend_model_full, tmpdirname) def save(self, dir, save_full_model=False, *args, **kwargs): """ Perform generation process of the model. - + Parameters ------------ dir : The directory to save model and tokenizer - + save_full_model : Optional. Whether to save full model. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. 
+ Returns ------------ outputs : - The generated sequence output + The generated sequence output """ self.get_tokenizer().save_pretrained(dir) if save_full_model and self.model_args.use_lora: diff --git a/src/lmflow/models/hf_encoder_decoder_model.py b/src/lmflow/models/hf_encoder_decoder_model.py deleted file mode 100644 index 6eb6ff2f7..000000000 --- a/src/lmflow/models/hf_encoder_decoder_model.py +++ /dev/null @@ -1,522 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -"""This is a class called HFDecoderModel which is a wrapper around transformers model and -tokenizer classes. It has several methods such as __init__, tokenize, and train that are -used for training and fine-tuning the model. The __init__ method takes in several arguments -such as model_args, tune_strategy, and ds_config, which are used to load the pretrained -model and tokenizer, and initialize the training settings. - -The tokenize method is used to tokenize the input text and return the input IDs and attention -masks that can be fed to the model for training or inference. - -This class supports different tune_strategy options such as 'normal', 'none', 'lora', and -'adapter', which allow for different fine-tuning settings of the model. However, the 'lora' -and 'adapter' strategies are not yet implemented. - -Overall, this class provides a convenient interface for loading and fine-tuning transformer -models and can be used for various NLP tasks such as language modeling, text classification, -and question answering. -""" - -import copy -import logging -import time -from typing import List, Union - -import deepspeed -import torch -from peft import ( - LoraConfig, - PeftModel, - TaskType, - get_peft_config, - get_peft_model, -) -from transformers import ( - CONFIG_MAPPING, - AutoConfig, - AutoTokenizer, - AutoModelForSeq2SeqLM, - AutoModelForVision2Seq, - AutoModel, - AutoProcessor, - LlamaConfig -) -from transformers.testing_utils import CaptureLogger - -from lmflow.datasets.dataset import Dataset -from lmflow.models.encoder_decoder_model import EncoderDecoderModel -from lmflow.models.interfaces.tunable import Tunable -from lmflow.models.vision2seq_model import CustomAutoVision2SeqModel -from lmflow.utils.multimodal import update_custom_config, load_llava_pretrain_model -from lmflow.utils.versioning import is_package_version_at_least - -if is_package_version_at_least("transformers", "4.46.0"): - from transformers.integrations.deepspeed import HfDeepSpeedConfig, HfTrainerDeepSpeedConfig -else: - from transformers.deepspeed import HfDeepSpeedConfig, HfTrainerDeepSpeedConfig - - -logger = logging.getLogger(__name__) - - -class HFEncoderDecoderModel(EncoderDecoderModel, Tunable): - r""" - Initializes a HFEncoderDecoderModel instance. - - Parameters - ------------ - - model_args : - Model arguments such as model name, path, revision, etc. - - tune_strategy : str or none, default="normal". - A string representing the dataset backend. Defaults to "huggingface". - - ds_config : - Deepspeed configuations. - - args : Optional. - Positional arguments. - - kwargs : Optional. - Keyword arguments. - """ - - def __init__( - self, - model_args, - tune_strategy='normal', - ds_config=None, - device="gpu", - use_accelerator=False, - custom_model=False, - with_deepspeed=True, - pipeline_args=None, - *args, - **kwargs - ): - """ - Initializes a HFDecoderModel instance. - :param model_args: dictionary with model arguments such as model name, path, revision, etc. 
- :param tune_strategy: tuning strategy: normal, none, lora or adapter - :param ds_config: deepspeed configuration for distributed training - """ - - # See more about loading any type of standard or custom dataset (from - # files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: The .from_pretrained methods guarantee that - # only one local process can concurrently download model & vocab. - - self.device = device - - if tune_strategy == 'normal': - raise NotImplementedError( - f"tune_strategy \"{tune_strategy}\" is not supported" - ) - elif tune_strategy == 'none': - if use_accelerator: - raise NotImplementedError( - f"Currently encoder2decoder model is not supported with accelerator" - ) - # dschf = HfDeepSpeedConfig(ds_config) - dschf = HfTrainerDeepSpeedConfig(ds_config) - if pipeline_args is not None: - dschf.trainer_config_process(pipeline_args) - peft_model_id = model_args.lora_model_path - # NOTE: Currently offload is not supported by llama - if "llama" in model_args.model_name_or_path and model_args.use_ram_optimized_load: - logger.warning( - "llama does not support RAM optimized load. Automatically" - " use original load instead." - ) - model_args.use_ram_optimized_load = False - - # get model register - self.arch_type = model_args.arch_type - if self.arch_type == "encoder_decoder": - if model_args.model_name_or_path == 'THUDM/chatglm-6b': - model_register = AutoModel - else: - model_register = AutoModelForSeq2SeqLM - elif self.arch_type == "vision_encoder_decoder": - if not custom_model: - model_register = AutoModelForVision2Seq - else: - model_register = CustomAutoVision2SeqModel - else: - raise NotImplementedError - if not custom_model: - if model_args.model_name_or_path == 'THUDM/chatglm-6b': - self.backend_model = model_register.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) - - elif model_args.use_ram_optimized_load and peft_model_id is None: - try: - # RAM-optimized load - self.backend_model = model_register.from_pretrained( - model_args.model_name_or_path, - device_map="auto", - offload_folder="offload", - offload_state_dict=True, - ) - except: - logger.warning( - "Failed to use RAM optimized load. Automatically" - " use original load instead." - ) - # Normal load - self.backend_model = model_register.from_pretrained( - model_args.model_name_or_path, - ) - else: - if peft_model_id is not None: - logger.warning( - "LoRA does not support RAM optimized load currently." - " Automatically use original load instead." - ) - self.backend_model = model_register.from_pretrained( - model_args.model_name_or_path, - ) - # else: - # self.backend_model = model_register.from_pretrained( - # model_args.model_name_or_path) - else: - if model_args.llava_loading is False: - # FIXME remove the following from_pretrained code by - # creating a unified pretrained model. 
- model = CustomAutoVision2SeqModel.from_pretrained(model_args.model_name_or_path) - if model_args.llm_model_name_or_path is not None: - text_config = LlamaConfig.from_pretrained(model_args.llm_model_name_or_path) - model.config.text_config = text_config - model.language_model_from_pretrained(model_args.llm_model_name_or_path, - low_resource=model_args.low_resource) - state_dict = torch.load( - model_args.pretrained_language_projection_path, - map_location="cpu") - model.load_state_dict(state_dict, strict=False) - else: - config = AutoConfig.from_pretrained( - model_args.model_name_or_path) - if model_args.low_resource: - kwargs = dict( - torch_dtype=torch.float16, - load_in_8bit=True, - device_map="auto", - ) - else: - # kwargs = dict(torch_dtype=torch.float16) - kwargs = dict(device_map="auto") - if (model_args.image_encoder_name_or_path is None and - model_args.qformer_name_or_path is None and - model_args.llm_model_name_or_path is None): - config = AutoConfig.from_pretrained( - model_args.model_name_or_path) - model = CustomAutoVision2SeqModel.from_pretrained( - model_args.model_name_or_path, **kwargs) - else: - config = update_custom_config(config, model_args) - model = CustomAutoVision2SeqModel( - config, - image_encoder_name_or_path=model_args.image_encoder_name_or_path, - qformer_name_or_path=model_args.qformer_name_or_path, - language_model_name_or_path=model_args.llm_model_name_or_path, - low_resource=model_args.low_resource) - if model_args.pretrained_language_projection_path is not None: - state_dict = torch.load( - model_args.pretrained_language_projection_path, map_location="cpu") - new_state_dict = {} - new_state_dict['model.language_projection.weight'] = \ - state_dict['model.mm_projector.weight'] - new_state_dict['model.language_projection.bias'] = \ - state_dict['model.mm_projector.bias'] - if model_args.llava_pretrain_model_path is not None: - # used for inference that directly load the preatrain model - model = load_llava_pretrain_model( - model, model_args.llava_pretrain_model_path) - if model_args.save_pretrain_model_path is not None: - model.save_pretrained( - model_args.save_pretrain_model_path) - self.backend_model = model - # init tokenizer - if self.arch_type == "encoder_decoder": - self.tokenizer = AutoTokenizer.from_pretrained( - model_args.model_name_or_path, trust_remote_code=True) - elif self.arch_type == "vision_encoder_decoder": - if model_args.llava_loading is False: - # blip2 image and token processor - self.tokenizer = AutoProcessor.from_pretrained( - model_args.model_name_or_path, trust_remote_code=True) - if model_args.llm_model_name_or_path is not None: - # update the tokenizer from the custom llm. - self.tokenizer.tokenizer = ( - AutoTokenizer.from_pretrained( - model_args.llm_model_name_or_path) - ) - self.image_processor = self.tokenizer.image_processor - - else: - # image processor is stored in the vision encoder - if model_args.llm_model_name_or_path is not None: - self.tokenizer = AutoTokenizer.from_pretrained( - model_args.llm_model_name_or_path) - else: - self.tokenizer = AutoTokenizer.from_pretrained( - config.text_config._name_or_path) - self.image_processor = self.backend_model.image_processor - else: - raise NotImplementedError - - self.backend_model_full = self.backend_model - if peft_model_id is not None: - self.backend_model = PeftModel.from_pretrained( - self.backend_model, peft_model_id - ) - if tune_strategy == "none" and with_deepspeed is True: - # when load the model with 4bit / 8bit. - # fail to use deepspeed. 
- if device == "gpu": - deepspeed.init_distributed() - self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0] - self.ds_engine.module.eval() - - self.tokenizer.padding_side = "left" # necessary for auto-gressive inference - - elif tune_strategy == 'adapter': - raise NotImplementedError('adapter tune strategy not implemented') - - if self.arch_type == "encoder_decoder": - if self.tokenizer.eos_token_id is None: - self.tokenizer.eos_token_id = self.backend_model.config.eos_token_id - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - - def tokenize(self, dataset, *args, **kwargs): - """ - Tokenize the full dataset. - - Parameters - ------------ - dataset : - Text dataset. - - args : Optional. - Positional arguments. - - kwargs : Optional. - Keyword arguments. - - Returns - ------------ - tokenized_datasets : - The tokenized dataset. - """ - raise NotImplementedError('tokenize not implemented') - - def encode(self, input: Union[str, List[str]], *args, **kwargs ) -> Union[List[int], List[List[int]]]: - """ - Perform encoding process of the tokenizer. - - Parameters - ------------ - inputs : str or list. - The text sequence. - - args : Optional. - Positional arguments. - - kwargs : Optional. - Keyword arguments. - - Returns - ------------ - outputs : - The tokenized inputs. - """ - # check how to handle the image processor - if isinstance(input, dict): - # TODO refactor the input type to make it elegant. - kwargs.update(input) - if "images" not in input: - tokens = self.tokenizer(*args, **kwargs) - else: - if getattr(self.tokenizer, "image_processor", None) is not None: - tokens = self.tokenizer(*args, **kwargs) - elif getattr(self, "image_processor", None) is not None: - images = kwargs.pop("images") - tokens = self.tokenizer(*args, **kwargs) - images = self.image_processor.preprocess( - images, return_tensors='pt')['pixel_values'][0] - tokens['pixel_values'] = images - else: - print("Can not find the image processor") - raise NotImplementedError - return tokens - elif isinstance(input, list): - return self.tokenizer(text=input, *args, **kwargs)#batch encode,will automatically do left padding - elif isinstance(input, str): - return self.tokenizer.encode(text=input, *args, **kwargs) - else: - raise NotImplementedError(f'type "{type(input)}" cannot be encoded') - - - def decode(self, input, *args, **kwargs ) -> Union[str, List[str]]: - """ - Perform decoding process of the tokenizer. - - Parameters - ------------ - inputs : list. - The token sequence. - - args : Optional. - Positional arguments. - - kwargs : Optional. - Keyword arguments. - - Returns - ------------ - outputs : - The text decoded from the token inputs. - """ - if isinstance(input, List): - input=torch.tensor(input) - if input.dim()==2: - return self.tokenizer.batch_decode(input, *args, **kwargs)#batch_decode - else: - # Can be list of ints or a Tensor - return self.tokenizer.decode(input, *args, **kwargs) - - - def inference(self, inputs, *args, **kwargs): - """ - Perform generation process of the model. - - Parameters - ------------ - inputs : - The sequence used as a prompt for the generation or as model inputs to the model. - - args : Optional. - Positional arguments. - - kwargs : Optional. - Keyword arguments. 
- - Returns - ------------ - outputs : - The generated sequence output - """ - # current_time = time.strftime("%H:%M:%S", time.localtime()) - # print(f"{current_time}: model.inference: start", flush=True) - - # TODO need to discuss how to handle pad_token_id - if self.arch_type == "encoder_decoder": - kwargs.update(pad_token_id=self.tokenizer.pad_token_id) - elif self.arch_type == "vision_encoder_decoder": - # TODO disucss how to modify the interface to remove this part. - inputs = copy.deepcopy(inputs) - input_ids = inputs.pop('input_ids') - kwargs.update(**inputs) - inputs = input_ids - - # current_time = time.strftime("%H:%M:%S", time.localtime()) - # print(f"{current_time}: model.inference: kwargs update end", flush=True) - - with torch.no_grad(): - if self.device == "gpu": - if getattr(self, "ds_engine", None) is not None: - outputs = self.ds_engine.module.generate( - input_ids=inputs, - synced_gpus=True, - *args, - **kwargs - ) - else: - outputs = self.backend_model.generate( - input_ids=inputs, - synced_gpus=True, - *args, - **kwargs, - ) - elif self.device == "cpu": - outputs = self.backend_model.generate( - input_ids=inputs, - synced_gpus=True, - *args, - **kwargs - ) - else: - raise NotImplementedError( - f"device \"{self.device}\" is not supported" - ) - - # current_time = time.strftime("%H:%M:%S", time.localtime()) - # print(f"{current_time}: model.inference: end", flush=True) - - return outputs - - - def merge_lora_weights(self): - if self.model_args.use_lora: - self.get_backend_model().merge_and_unload() - else: - logger.warning("LoRA training is NOT enabled. Merging LoRA weights is not applicable.") - - - def save(self, dir, save_full_model=False, *args, **kwargs): - """ - Perform generation process of the model. - - Parameters - ------------ - dir : - The directory to save model and tokenizer - - save_full_model : Optional. - Whether to save full model. - - kwargs : Optional. - Keyword arguments. - - Returns - ------------ - outputs : - The generated sequence output - """ - self.get_tokenizer().save_pretrained(dir) - if save_full_model and self.model_args.use_lora: - self.backend_model_full.save_pretrained(dir) - else: - self.get_backend_model().save_pretrained(dir) - - - def get_max_length(self): - """ - Return max acceptable input length in terms of tokens. - """ - if "tokenizer" not in self.tokenizer.__dict__: - return self.tokenizer.model_max_length - else: - # for the multi-modality processor, - # the max length is stored in the inner text tokenizer - return self.tokenizer.tokenizer.model_max_length - - - def get_tokenizer(self): - """ - Return the tokenizer of the model. - """ - return self.tokenizer - - - def get_backend_model(self): - """ - Return the backend model. - """ - return self.backend_model diff --git a/src/lmflow/models/hf_model_mixin.py b/src/lmflow/models/hf_model_mixin.py index 45f414292..42ff9b398 100644 --- a/src/lmflow/models/hf_model_mixin.py +++ b/src/lmflow/models/hf_model_mixin.py @@ -1,57 +1,46 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
+import copy import gc -import os import logging -from typing import Union, Optional, Dict, List -import copy +from contextlib import nullcontext +from typing import Optional, Union import torch -import deepspeed +from peft import LoraConfig, PeftModel, TaskType, get_peft_model +from peft.utils.constants import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING from transformers import ( CONFIG_MAPPING, AutoConfig, - BitsAndBytesConfig, + AutoModelForCausalLM, + AutoModelForSequenceClassification, AutoTokenizer, + BitsAndBytesConfig, PreTrainedTokenizer, PreTrainedTokenizerFast, - AutoModelForCausalLM, - AutoModelForSequenceClassification, ) -from peft import ( - LoraConfig, - PeftModel, - TaskType, - get_peft_model, - prepare_model_for_kbit_training -) -from peft.utils.constants import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING +from transformers.modeling_utils import is_fsdp_enabled -from lmflow.models.base_model import BaseModel -from lmflow.utils.constants import ( - LMFLOW_LORA_TARGET_MODULES_MAPPING -) from lmflow.args import ModelArguments -from lmflow.utils.versioning import is_vllm_available +from lmflow.models.base_model import BaseModel +from lmflow.utils.constants import LMFLOW_LORA_TARGET_MODULES_MAPPING +from lmflow.utils.envs import is_accelerate_env +from lmflow.utils.versioning import is_deepspeed_available, is_vllm_available if is_vllm_available(): - from vllm import LLM, SamplingParams + from vllm import LLM from vllm.distributed.parallel_state import destroy_model_parallel logger = logging.getLogger(__name__) -HF_AUTOMODEL_MAPPING = { - "decoder_only": AutoModelForCausalLM, - "text_regression": AutoModelForSequenceClassification -} +HF_AUTOMODEL_MAPPING = {"decoder_only": AutoModelForCausalLM, "text_regression": AutoModelForSequenceClassification} HF_AUTOMODEL_TYPE = Union[AutoModelForCausalLM, AutoModelForSequenceClassification] LORA_TARGET_MODULES_MAPPING = { - k: TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.get(k, LMFLOW_LORA_TARGET_MODULES_MAPPING.get(k)) + k: TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING.get(k, LMFLOW_LORA_TARGET_MODULES_MAPPING.get(k)) for k in set(TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING) | set(LMFLOW_LORA_TARGET_MODULES_MAPPING) } @@ -61,27 +50,21 @@ def __init__( self, model_args: ModelArguments, do_train: bool, - ds_config=None, - device: Optional[str]="gpu", - use_accelerator: bool=False, - hf_auto_model_additional_args: Optional[Dict]=None, + device: Optional[str] = "gpu", + hf_auto_model_additional_args: Optional[dict] = None, *args, - **kwargs + **kwargs, ): """Initializes a HFModel instance. Parameters ---------- - model_args : + model_args : Dictionary with model arguments such as model name, path, revision, etc. do_train : bool To prepare the model for training or inference. 
- ds_config : optional - Deepspeed configuration for distributed training, by default None device : str, optional By default "gpu" - use_accelerator : bool, optional - By default False """ # See more about loading any type of standard or custom dataset (from @@ -96,23 +79,22 @@ def __init__( self.device = device self.model_args = model_args self.hf_auto_model = HF_AUTOMODEL_MAPPING[model_args.arch_type] - self.use_accelerator = use_accelerator - self.ds_config = ds_config self.do_train = do_train - + self.tokenizer = self.__prepare_tokenizer(model_args) self.torch_dtype = self.__prepare_dtype(model_args) self.hf_model_config = self.__prepare_model_config(model_args, hf_auto_model_additional_args) self.quant_config = self.__prepare_quant_config(model_args) self.peft_config = self.__prepare_peft_config(model_args) - self._activated = False # for inference load and offload - + self._activated = False # for inference load and offload + # Some implementations require custom modules to be injected into the model. self.__model_module_inject(model_args) if self.do_train: self.__prepare_model_for_training(model_args, self.hf_auto_model) - + + self.__fix_special_tokens() def __prepare_tokenizer( self, @@ -126,7 +108,7 @@ def __prepare_tokenizer( " script, save it, and load it from here, using" " --tokenizer_name." ) - + tokenizer_kwargs = { "cache_dir": model_args.cache_dir, "use_fast": model_args.use_fast_tokenizer, @@ -134,31 +116,30 @@ def __prepare_tokenizer( "token": model_args.token, "trust_remote_code": model_args.trust_remote_code, } - if model_args.padding_side != 'auto': + if model_args.padding_side != "auto": tokenizer_kwargs["padding_side"] = model_args.padding_side - + try: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **tokenizer_kwargs) except RecursionError: logger.warning( "The tokenizer_config.json file doesn't set the special tokens. Using default values: " - ", , for unknown token, bos token and eos token respectively.") - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, unk_token="", - bos_token="", - eos_token="", - **tokenizer_kwargs) + ", , for unknown token, bos token and eos token respectively." + ) + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_name, unk_token="", bos_token="", eos_token="", **tokenizer_kwargs + ) tokenizer.truncation_side = model_args.truncation_side or tokenizer.truncation_side tokenizer.model_max_length = model_args.model_max_length or tokenizer.model_max_length - + return tokenizer - - + def __prepare_dtype( self, model_args: ModelArguments, ) -> torch.dtype: - if model_args.arch_type == 'text_regression': + if model_args.arch_type == "text_regression": if model_args.torch_dtype in ["auto", None, "bf16", "bfloat16"]: torch_dtype = torch.bfloat16 else: @@ -167,32 +148,32 @@ def __prepare_dtype( f"If you are doing reward modeling," f" InstructGPT uses torch.bfloat16 for reward model, but you" f" are using {torch_dtype} for your reward model init. Ignore" - f" this warning if it is intended.") + f" this warning if it is intended." 
+ ) else: torch_dtype = ( model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype) ) - + logger.debug(f"torch_dtype on init: {torch_dtype}") - - return torch_dtype + return torch_dtype def __prepare_model_config( self, model_args: ModelArguments, - hf_auto_model_additional_args: Optional[Dict]=None, + hf_auto_model_additional_args: Optional[dict] = None, ): """Prepare model configuration for hf auto register, Parameters ---------- model_args : ModelArguments LMFlow model arguments. - hf_auto_model_additional_args : Optional[Dict], optional - Special configurations such as `num_labels` in `AutoModelForSequenceClassification` - (commonly used in reward modeling) will not preset in __prepare_model_config, + hf_auto_model_additional_args : Optional[dict], optional + Special configurations such as `num_labels` in `AutoModelForSequenceClassification` + (commonly used in reward modeling) will not preset in __prepare_model_config, so it should be passed in hf_auto_model_additional_args. Returns ------- @@ -209,7 +190,7 @@ def __prepare_model_config( } if hf_auto_model_additional_args is not None: config_kwargs.update(hf_auto_model_additional_args) - + if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) elif model_args.model_name_or_path: @@ -221,10 +202,9 @@ def __prepare_model_config( logger.info(f"Overriding config: {model_args.config_overrides}") config.update_from_string(model_args.config_overrides) logger.info(f"New config: {config}") - + return config - - + def __prepare_quant_config( self, model_args: ModelArguments, @@ -232,24 +212,44 @@ def __prepare_quant_config( quant_config = None if self.do_train: if model_args.use_qlora: - quant_config = BitsAndBytesConfig( - load_in_4bit=model_args.bits == 4, - load_in_8bit=model_args.bits == 8, - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=self.torch_dtype, - bnb_4bit_use_double_quant=model_args.double_quant, - bnb_4bit_quant_type=model_args.quant_type, - ) - else: # inference + if model_args.quant_bit == 8: + if is_fsdp_enabled(): + raise ValueError("FSDP + Qlora 8-bit quantization is not supported.") + quant_config_kwargs = { + "load_in_8bit": True, + } + elif model_args.quant_bit == 4: + logger.warning( + "For users who are using Accelerate (FSDP backend) or DeepSpeed, " + "we only implement Qlora 4-bit quantization with torch.bfloat16 dtype currently. " + "Carefully check the Accelerate or DeepSpeed configurations, since they may cast dtype " + "and cause errors like " + "(DeepSpeed) `TypeError: output tensor must have the same type as input tensor`, or " + "(Accelerate FSDP) `ValueError: Must flatten tensors with uniform dtype but got " + "torch.bfloat16 and torch.float32`. " + "Consider using other peft methods if your device doesn't support torch.bfloat16. 
" + "(This is just a notification and please self-check the compatibility of your device.)" + ) + quant_config_kwargs = { + "load_in_4bit": True, + "bnb_4bit_compute_dtype": torch.bfloat16, + "bnb_4bit_use_double_quant": model_args.double_quant, + "bnb_4bit_quant_type": model_args.quant_type, + "bnb_4bit_quant_storage": torch.bfloat16, # fsdp+qlora, see https://huggingface.co/docs/bitsandbytes/v0.43.3/en/fsdp_qlora + } + else: + raise ValueError("Qlora only supports 4-bit and 8-bit.") + + quant_config = BitsAndBytesConfig(**quant_config_kwargs) + + else: # inference if model_args.use_int8: quant_config = BitsAndBytesConfig( - load_in_8bit = model_args.use_int8, + load_in_8bit=model_args.use_int8, ) - + return quant_config - def __prepare_peft_config( self, model_args: ModelArguments, @@ -263,9 +263,11 @@ def __prepare_peft_config( if hasattr(model_config, "to_dict"): model_config = model_config.to_dict() if "model_type" not in model_config or not model_config["model_type"]: - logger.warning("It seems that your base model is a custom model, since " - "model_type is not found in model_config when preparing peft config. " - "Setting model_type to 'custom' as a fallback.") + logger.warning( + "It seems that your base model is a custom model, since " + "model_type is not found in model_config when preparing peft config. " + "Setting model_type to 'custom' as a fallback." + ) model_config["model_type"] = "custom" lora_target_modules = LORA_TARGET_MODULES_MAPPING.get(model_config["model_type"], None) @@ -285,12 +287,14 @@ def __prepare_peft_config( if hasattr(model_config, "to_dict"): model_config = model_config.to_dict() if "model_type" not in model_config or not model_config["model_type"]: - logger.warning("It seems that your base model is a custom model, since " - "model_type is not found in model_config when preparing peft config. " - "Setting model_type to 'custom' as a fallback.") + logger.warning( + "It seems that your base model is a custom model, since " + "model_type is not found in model_config when preparing peft config. " + "Setting model_type to 'custom' as a fallback." + ) model_config["model_type"] = "custom" lora_target_modules = LORA_TARGET_MODULES_MAPPING.get(model_config["model_type"], None) - + peft_config = LoraConfig( use_dora=True, task_type=TaskType.CAUSAL_LM, @@ -301,27 +305,26 @@ def __prepare_peft_config( target_modules=lora_target_modules, ) return peft_config - - + def __model_module_inject( self, model_args: ModelArguments, ) -> None: """Override some model modules with custom implementations. - + Current implementations: - - Position interpolation (model_args.do_rope_scaling): + - Position interpolation (model_args.do_rope_scaling): replace llama embeddings with condense embeddings. """ # position interpolation if model_args.do_rope_scaling: if "LlamaForCausalLM" in self.model_config.architectures: from lmflow.utils.position_interpolation.llama_rope_scaled_monkey_patch import ( - replace_llama_with_condense, + replace_llama_with_condense, ) + replace_llama_with_condense(model_args.rope_pi_ratio, model_args.rope_ntk_ratio) - - + def __prepare_model_for_training( self, model_args: ModelArguments, @@ -330,11 +333,11 @@ def __prepare_model_for_training( assert self.do_train, "To prepare the model for training, please set do_train=True." 
# TODO: change to accelerate - if 'hymba' in model_args.model_name_or_path: + if "hymba" in model_args.model_name_or_path: import torch._dynamo + torch._dynamo.config.suppress_errors = True torch._dynamo.config.disable = True - logger.info("Preparing model for training") if model_args.model_name_or_path: @@ -346,63 +349,56 @@ def __prepare_model_for_training( trust_remote_code=model_args.trust_remote_code, ) - if model_args.use_qlora: - model.gradient_checkpointing_enable() - model = prepare_model_for_kbit_training(model) else: model = hf_auto_model.from_config(self.hf_model_config) n_params = sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values()) - logger.info(f"Training new model from scratch - Total size={n_params/2**20:.2f}M params") + logger.info(f"Training new model from scratch - Total size={n_params / 2**20:.2f}M params") self.backend_model_full = model - + if model_args.ignore_bias_buffers: # torch distributed hack - # fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. + # fix for DDP issues with LM bias/mask buffers - invalid scalar type, inplace operation. # See: https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992 model._ddp_params_and_buffers_to_ignore = [ name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool ] - + if model_args.use_lora or model_args.use_dora: model.enable_input_require_grads() - if model_args.lora_model_path is not None: - # Load model from LoRA weights - model = PeftModel.from_pretrained( - model, - model_args.lora_model_path, - is_trainable=True, - ) - else: - # New LoRA Finetuning - model = get_peft_model(model, self.peft_config) + model = get_peft_model(model, self.peft_config) model.print_trainable_parameters() # We resize the embeddings only when necessary to avoid index errors. # If you are creating a model from scratch on a small vocab and want a # smaller embedding size, remove this test. 
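Taken together, the quantization and PEFT hunks above describe the new QLoRA training path: load the base model through a 4-bit `BitsAndBytesConfig` whose compute and storage dtypes are pinned to `torch.bfloat16` (so FSDP/DeepSpeed can flatten weights with a uniform dtype, per the linked bitsandbytes FSDP-QLoRA docs), wrap it with `get_peft_model`, and resize the embedding table only if the tokenizer has outgrown it. A hedged end-to-end sketch with placeholder names and hyperparameters:

```python
import torch
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "my-org/my-model"  # placeholder

# Mirrors the 4-bit branch above; double-quant / quant-type values stand in for
# model_args.double_quant and model_args.quant_type.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_quant_storage=torch.bfloat16,  # uniform dtype so FSDP can flatten weights
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

# LoRA (or DoRA, by setting use_dora=True) on top of the quantized base model.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],  # normally resolved from the merged mapping
)
model.enable_input_require_grads()  # needed when training adapters on a frozen base
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Resize embeddings only when the tokenizer outgrew the embedding table
# (under DeepSpeed ZeRO-3 the weight is gathered first; outside that it is a no-op).
embedding_size = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_size:
    model.resize_token_embeddings(len(tokenizer))
```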
- with deepspeed.zero.GatheredParameters(model.get_input_embeddings().weight, modifier_rank=None): + resize_embedding_context = nullcontext() + if is_deepspeed_available() and not is_accelerate_env(): + import deepspeed + + resize_embedding_context = deepspeed.zero.GatheredParameters( + model.get_input_embeddings().weight, modifier_rank=None + ) + + with resize_embedding_context: weights = model.get_input_embeddings().weight embedding_size = weights.shape[0] + if len(self.tokenizer) > embedding_size: model.resize_token_embeddings(len(self.tokenizer)) self.backend_model = model - self.__prepare_model_post_process() - def __prepare_model_for_inference( self, model_args: ModelArguments, hf_auto_model: HF_AUTOMODEL_TYPE, - use_accelerator: bool, - ds_config ): logger.info(f"Backend model already initialized, moving to device: {self.device}") if hasattr(self, "backend_model"): if self.backend_model.device == torch.device("cpu"): self.backend_model.to(self.device) return - + # TODO: change to accelerate logger.info("Preparing model for inference") inference_load_kwargs = {} @@ -413,13 +409,9 @@ def __prepare_model_for_inference( "offload_state_dict": True, } - if use_accelerator or model_args.use_ram_optimized_load: + if model_args.use_ram_optimized_load: inference_load_kwargs.update(ram_optimized_load_kwargs) - - if not use_accelerator: - from transformers.integrations import HfDeepSpeedConfig - dschf = HfDeepSpeedConfig(ds_config) - + try: self.backend_model = hf_auto_model.from_pretrained( model_args.model_name_or_path, @@ -428,10 +420,8 @@ def __prepare_model_for_inference( quantization_config=self.quant_config, **inference_load_kwargs, ) - except: - logger.warning( - "Failed to use RAM optimized load. Using original load instead." - ) + except Exception: + logger.warning("Failed to use RAM optimized load. Using original load instead.") self.backend_model = hf_auto_model.from_pretrained( model_args.model_name_or_path, torch_dtype=self.torch_dtype, @@ -439,23 +429,25 @@ def __prepare_model_for_inference( quantization_config=self.quant_config, **inference_load_kwargs_bak, ) - + self.backend_model_full = self.backend_model - + if model_args.lora_model_path is not None: self.backend_model = PeftModel.from_pretrained( - self.backend_model, + self.backend_model, model_args.lora_model_path, ) - if (not use_accelerator) and self.device == "gpu": - deepspeed.init_distributed() - self.ds_engine = deepspeed.initialize(model=self.backend_model, config_params=ds_config)[0] - self.ds_engine.module.eval() - - self.__prepare_model_post_process() - - + if self.device == "gpu" and not is_accelerate_env(): + if is_deepspeed_available(): + import deepspeed + + deepspeed.init_distributed() + self.ds_engine = deepspeed.initialize(model=self.backend_model)[0] + self.ds_engine.module.eval() + else: + raise ImportError("Deepspeed is not available. Please install via `pip install -e '.[deepspeed]'`.") + def __prepare_model_for_vllm_inference( self, model_args: ModelArguments, @@ -464,7 +456,7 @@ def __prepare_model_for_vllm_inference( ): if not is_vllm_available(): raise ImportError('VLLM is not available. 
Please install via `pip install -e ".[vllm]"`.') - + self.backend_model_for_inference = LLM( model=model_args.model_name_or_path, tokenizer=model_args.model_name_or_path, @@ -473,40 +465,38 @@ def __prepare_model_for_vllm_inference( gpu_memory_utilization=vllm_gpu_memory_utilization, tensor_parallel_size=vllm_tensor_parallel_size, ) - - - def __prepare_model_post_process(self): - # old models/tokenizers may not have these attributes, fixing + + def __fix_special_tokens(self): + # old models/tokenizers may not have these attributes, fixing if self.tokenizer.eos_token is None: - self.tokenizer.eos_token = self.backend_model.config.eos_token + self.tokenizer.eos_token = self.hf_model_config.eos_token if self.tokenizer.eos_token_id is None: - self.tokenizer.eos_token_id = self.backend_model.config.eos_token_id - + self.tokenizer.eos_token_id = self.hf_model_config.eos_token_id + if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token if self.tokenizer.pad_token_id is None: self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - + if self.model_args.eos_padding: self.tokenizer.pad_token = self.tokenizer.eos_token - - if not hasattr(self.backend_model.config, "pad_token_id"): + + if not hasattr(self.hf_model_config, "pad_token_id"): logger.warning("pad_token_id not found in model config. Setting pad_token_id to eos_token_id.") - self.backend_model.config.pad_token_id = self.backend_model.config.eos_token_id - elif self.backend_model.config.pad_token_id is None: + self.hf_model_config.pad_token_id = self.hf_model_config.eos_token_id + elif self.hf_model_config.pad_token_id is None: logger.warning("pad_token_id is None in model config. Setting pad_token_id to eos_token_id.") - self.backend_model.config.pad_token_id = self.backend_model.config.eos_token_id - + self.hf_model_config.pad_token_id = self.hf_model_config.eos_token_id def activate_model_for_inference( self, - use_vllm: bool=False, + use_vllm: bool = False, **kwargs, ): if self._activated: logger.warning("You are trying to activate the model for inference, but it is already activated.") return - + if use_vllm: self.__prepare_model_for_vllm_inference( model_args=self.model_args, @@ -517,28 +507,25 @@ def activate_model_for_inference( self.__prepare_model_for_inference( model_args=self.model_args, hf_auto_model=self.hf_auto_model, - use_accelerator=self.use_accelerator, - ds_config=self.ds_config, ) - + self._activated = True - - + def deactivate_model_for_inference( self, - use_vllm: bool=False, + use_vllm: bool = False, ): """Deactivate the model and release the resources. - + NOTE: Currently, VLLM doesn't have an official way to do this, and the implementation below cannot release all gpu resources by our observation. - Thus this method is just a placeholder for future implementation. See: + Thus this method is just a placeholder for future implementation. See: [Github issue](https://github.com/vllm-project/vllm/issues/1908) """ if not self._activated: logger.warning("You are trying to deactivate the model for inference, but it is already deactivated.") return - + if use_vllm: destroy_model_parallel() del self.backend_model_for_inference.llm_engine.model_executor.driver_worker @@ -548,24 +535,21 @@ def deactivate_model_for_inference( else: self.backend_model.to("cpu") pass - + self._activated = False - def get_max_length(self): """ Return max acceptable input length in terms of tokens. """ return self.tokenizer.model_max_length - def get_tokenizer(self): """ Return the tokenizer of the model. 
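`__fix_special_tokens` above now patches the tokenizer and the cached `hf_model_config` rather than the loaded backend model, falling back to the EOS token wherever a PAD token is missing. A sketch of that fixup against a generic tokenizer/config pair (the names are stand-ins for the attributes prepared earlier in this class):

```python
# Sketch of the PAD/EOS fallback applied in __fix_special_tokens.
def fix_special_tokens(tokenizer, config):
    # Old tokenizers may lack an EOS token; borrow it from the model config.
    if tokenizer.eos_token is None:
        tokenizer.eos_token = config.eos_token
    if tokenizer.eos_token_id is None:
        tokenizer.eos_token_id = config.eos_token_id
    # Fall back to EOS wherever PAD is missing, on both tokenizer and config.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    if getattr(config, "pad_token_id", None) is None:
        config.pad_token_id = config.eos_token_id
```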
""" return self.tokenizer - def get_backend_model(self): """ Return the backend model. diff --git a/src/lmflow/models/hf_text_regression_model.py b/src/lmflow/models/hf_text_regression_model.py index 46904cdbd..8ebd25077 100644 --- a/src/lmflow/models/hf_text_regression_model.py +++ b/src/lmflow/models/hf_text_regression_model.py @@ -1,50 +1,40 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. -import os import copy import hashlib import logging -from pathlib import Path -from typing import List, Union, Dict, Optional +from typing import Optional, Union import torch -from peft import ( - LoraConfig, - PeftModel, - TaskType, - get_peft_config, - get_peft_model, - prepare_model_for_kbit_training -) from transformers.modeling_outputs import SequenceClassifierOutputWithPast from lmflow.args import ModelArguments -from lmflow.datasets.dataset import Dataset, KEY_SCORE -from lmflow.models.interfaces.tunable import Tunable +from lmflow.datasets.dataset import KEY_SCORE, Dataset from lmflow.models.hf_model_mixin import HFModelMixin +from lmflow.models.interfaces.tunable import Tunable from lmflow.models.text_regression_model import TextRegressionModel from lmflow.tokenization.hf_text_regression_model import ( - paired_conversation_tokenize_function, conversation_tokenize_function, - tokenize_function, + paired_conversation_tokenize_function, text_to_textlist_tokenize_function, + tokenize_function, ) -from lmflow.utils.conversation_template import PRESET_TEMPLATES from lmflow.utils.constants import ( - PAIRED_CONVERSATION_DATASET_DESCRIPTION, + CONVERSATION_DATASET_DESCRIPTION, + PAIRED_CONVERSATION_DATASET_DESCRIPTION, TEXT2TEXT_DATASET_DESCRIPTION, TEXT_ONLY_DATASET_DESCRIPTION, TEXT_TO_TEXTLIST_DATASET_DESCRIPTION, - CONVERSATION_DATASET_DESCRIPTION, ) +from lmflow.utils.conversation_template import PRESET_TEMPLATES from lmflow.utils.data_utils import RewardModelInferenceResultWithInput +from lmflow.utils.envs import is_accelerate_env from lmflow.utils.versioning import is_ray_available, is_vllm_available if is_ray_available(): import ray import ray.data - + if is_vllm_available(): from vllm import SamplingParams @@ -59,76 +49,48 @@ class HFTextRegressionModel(TextRegressionModel, HFModelMixin, Tunable): Parameters ------------ - model_args : + model_args : Model arguments such as model name, path, revision, etc. - tune_strategy : str or none, default="normal". - A string representing the dataset backend. Defaults to "huggingface". - - ds_config : - Deepspeed configuations. - + do_train : bool, default True + Determines whether to prepare the model for training, including distribtued env, model placement, quantization, + lora, etc. + args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. + Keyword arguments. """ - def __init__( - self, - model_args: ModelArguments, - tune_strategy: str='normal', - ds_config=None, - device="gpu", - use_accelerator=False, - *args, - **kwargs - ): - """ - Initializes a HFTextRegressionModel instance. - :param model_args: dictionary with model arguments such as model name, path, revision, etc. - :param tune_strategy: tuning strategy: normal, none, lora or adapter - :param ds_config: deepspeed configuration for distributed training - """ + def __init__(self, model_args: ModelArguments, do_train: bool = False, device="gpu", **kwargs): assert model_args.arch_type == "text_regression", ( - f"Invalid model architecture type: {model_args.arch_type}. 
" - f"Expected: text_regression" + f"Invalid model architecture type: {model_args.arch_type}. Expected: text_regression" ) config_additional_args = {"num_labels": 1} HFModelMixin.__init__( self, model_args=model_args, - do_train=True if tune_strategy == "normal" else False, - ds_config=ds_config, + do_train=do_train, device=device, - use_accelerator=use_accelerator, hf_auto_model_additional_args=config_additional_args, - *args, - **kwargs + **kwargs, ) - - def tokenize( - self, - dataset: Dataset, - add_special_tokens=True, - *args, - **kwargs - ): + def tokenize(self, dataset: Dataset, add_special_tokens=True, *args, **kwargs): """ Tokenize the full dataset. - + Parameters ------------ dataset : lmflow.datasets.Dataset. args : Optional. Positional arguments. - + kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ tokenized_datasets : @@ -139,18 +101,17 @@ def tokenize( # Preprocessing the datasets. # First we tokenize all the texts. if dataset.get_backend() != "huggingface": - raise NotImplementedError( - "tokenization of datasets with non-huggingface backend are" - "not supported yet" - ) + raise NotImplementedError("tokenization of datasets with non-huggingface backend arenot supported yet") dataset_type = dataset.get_type() model_args = self.model_args raw_datasets = dataset hf_raw_datasets = dataset.get_backend_dataset() - column_names = list(hf_raw_datasets.features) # in paired conversation, for example, would be 'chosen' and 'rejected' + column_names = list( + hf_raw_datasets.features + ) # in paired conversation, for example, would be 'chosen' and 'rejected' data_args = raw_datasets.get_data_args() - + # Whether to truncate long sequences to fit into max_length use_truncation = False if model_args.use_lora or data_args.disable_group_texts: @@ -171,7 +132,7 @@ def tokenize( "data_args": data_args, "tokenizer": self.tokenizer, "column_names": column_names, - } + } if dataset_type == "text_only": tokenize_fn = tokenize_function text_only_tokenize_fn_kwargs = { @@ -181,7 +142,7 @@ def tokenize( "use_truncation": use_truncation, } tokenize_fn_kwargs.update(text_only_tokenize_fn_kwargs) - + elif dataset_type == "text2text": tokenize_fn = tokenize_function text2text_tokenize_fn_kwargs = { @@ -191,13 +152,13 @@ def tokenize( "use_truncation": use_truncation, } tokenize_fn_kwargs.update(text2text_tokenize_fn_kwargs) - + elif dataset_type in ["conversation", "paired_conversation"]: if dataset_type == "conversation": tokenize_fn = conversation_tokenize_function elif dataset_type == "paired_conversation": tokenize_fn = paired_conversation_tokenize_function - + if data_args.conversation_template: if data_args.conversation_template in PRESET_TEMPLATES.keys(): conversation_template = PRESET_TEMPLATES[data_args.conversation_template] @@ -207,10 +168,10 @@ def tokenize( ) else: logger.warning("No conversation template provided. 
Using default template.") - conversation_template = PRESET_TEMPLATES['empty'] + conversation_template = PRESET_TEMPLATES["empty"] tokenize_fn_kwargs["conversation_template"] = conversation_template logger.warning(f"Conversation template: {conversation_template}") - + elif dataset_type == "text_to_textlist": tokenize_fn = text_to_textlist_tokenize_function text_to_textlist_tokenize_fn_kwargs = { @@ -218,10 +179,10 @@ def tokenize( "use_truncation": use_truncation, } tokenize_fn_kwargs.update(text_to_textlist_tokenize_fn_kwargs) - + else: raise NotImplementedError( - f"Dataset type \"{dataset_type}\" is not supported, currently" + f'Dataset type "{dataset_type}" is not supported, currently' " only support following data types for HFTextRegressionModel:\n" f" 1) [Inference]{TEXT_ONLY_DATASET_DESCRIPTION}\n" f" 2) [Inference]{TEXT2TEXT_DATASET_DESCRIPTION}\n" @@ -229,17 +190,21 @@ def tokenize( f" 4) [Inference]{CONVERSATION_DATASET_DESCRIPTION}\n" f" 5) [Inference]{TEXT_TO_TEXTLIST_DATASET_DESCRIPTION}\n" ) - + tokenize_kwargs = {} if not data_args.streaming: fingerprint = hashlib.md5( ( raw_datasets.get_fingerprint() + str(self.tokenizer) - + f'###padding_side={self.tokenizer.padding_side}' - + ('###conversation_template=' + str(conversation_template) if "conversation" in dataset_type else "") - + f'###disable_group_texts={data_args.disable_group_texts}' - + f'###block_size={data_args.block_size}' + + f"###padding_side={self.tokenizer.padding_side}" + + ( + "###conversation_template=" + str(conversation_template) + if "conversation" in dataset_type + else "" + ) + + f"###disable_group_texts={data_args.disable_group_texts}" + + f"###block_size={data_args.block_size}" ).encode("utf-8") ).hexdigest() tokenize_kwargs = { @@ -250,25 +215,16 @@ def tokenize( } tokenized_datasets = raw_datasets.map( - tokenize_fn, - batched=True, - remove_columns=column_names, - fn_kwargs=tokenize_fn_kwargs, - **tokenize_kwargs + tokenize_fn, batched=True, remove_columns=column_names, fn_kwargs=tokenize_fn_kwargs, **tokenize_kwargs ) return tokenized_datasets - def inference( - self, - inputs, - release_gpu: bool = False, - use_vllm: bool = False, - **kwargs - ) -> Union[List[float], SequenceClassifierOutputWithPast]: + self, inputs, release_gpu: bool = False, use_vllm: bool = False, **kwargs + ) -> Union[list[float], SequenceClassifierOutputWithPast]: """ Perform generation process of the model. - + Parameters ------------ inputs : @@ -280,64 +236,58 @@ def inference( use_vllm : bool, optional Whether to use VLLM for inference, by default False. kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ outputs : - The generated sequence output + The generated sequence output """ if use_vllm: - logger.warning( - "VLLM inference is not supported for text regression model, using normal inference instead." - ) + logger.warning("VLLM inference is not supported for text regression model, using normal inference instead.") use_vllm = False - + if not self._activated: self.activate_model_for_inference( use_vllm=use_vllm, **kwargs, ) - + if use_vllm: res = self.__vllm_inference(inputs, **kwargs) else: res = self.__inference(inputs, **kwargs) - + if release_gpu: self.deactivate_model_for_inference(use_vllm=use_vllm) - - return res + return res - def __inference( - self, - inputs, - **kwargs - ): + def __inference(self, inputs, **kwargs): """ Perform generation process of the model. 
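When the data is not streamed, `tokenize` derives a cache fingerprint from everything that can change the tokenized output: the raw dataset fingerprint, the tokenizer, the padding side, the conversation template (when applicable), `disable_group_texts`, and `block_size`. A small sketch of building such a key with illustrative values; in LMFlow the hash is handed to the `datasets` map call so stale tokenization caches are not reused:

```python
import hashlib

# Illustrative values; the real ones come from the dataset, tokenizer and data_args.
raw_fingerprint = "abc123"
tokenizer_repr = "LlamaTokenizerFast(name_or_path='...', padding_side='right')"
padding_side = "right"
disable_group_texts = True
block_size = 512

fingerprint = hashlib.md5(
    (
        raw_fingerprint
        + tokenizer_repr
        + f"###padding_side={padding_side}"
        + f"###disable_group_texts={disable_group_texts}"
        + f"###block_size={block_size}"
    ).encode("utf-8")
).hexdigest()
print(fingerprint)  # stable across runs as long as none of the inputs change
```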
- + Parameters ------------ inputs : The **tokenized** sequence used as a prompt for the generation or as model inputs to the model. kwargs : Optional. - Keyword arguments. - + Keyword arguments. + Returns ------------ outputs : - The generated sequence output - """ + The generated sequence output + """ with torch.no_grad(): - if self.use_accelerator: + if is_accelerate_env(): outputs = self.backend_model( input_ids=inputs, **kwargs, ) else: if self.device == "gpu": + # for scripts that run using 'deepspeed script.py' outputs = self.ds_engine.module( input_ids=inputs, synced_gpus=True, @@ -350,27 +300,24 @@ def __inference( **kwargs, ) else: - raise NotImplementedError( - f"device \"{self.device}\" is not supported" - ) - - if kwargs.get('return_input', False): + raise NotImplementedError(f'device "{self.device}" is not supported') + + if kwargs.get("return_input", False): outputs = {"input": inputs, "output": outputs} - + return outputs - - + def __vllm_inference( - self, - inputs: Union[str, List[str]], - sampling_params: Optional['SamplingParams'] = None, + self, + inputs: Union[str, list[str]], + sampling_params: Optional["SamplingParams"] = None, **kwargs, - ) -> Union[List[List[str]], List[List[List[int]]]]: + ) -> Union[list[list[str]], list[list[list[int]]]]: """Perform VLLM inference process of the model. Parameters ---------- - inputs : Union[str, List[str]] + inputs : Union[str, list[str]] Prompt(s), string or a list of strings. sampling_params : Optional[SamplingParams], optional vllm SamplingParams object, by default None. @@ -378,35 +325,28 @@ def __vllm_inference( Returns ------- """ - raise NotImplementedError( - "VLLM inference is not supported for text regression model." - ) - - + raise NotImplementedError("VLLM inference is not supported for text regression model.") + def prepare_inputs_for_inference( self, dataset: Dataset, enable_distributed_inference: bool = False, use_vllm: bool = False, **kwargs, - ) -> Union[Dataset, 'ray.data.Dataset']: + ) -> Union[Dataset, "ray.data.Dataset"]: if use_vllm: - raise NotImplementedError( - "VLLM inference is not supported for text regression model." - ) - + raise NotImplementedError("VLLM inference is not supported for text regression model.") + inference_inputs = self.tokenize(dataset) - + if enable_distributed_inference: if not is_ray_available(): - raise ValueError( - 'Ray is not available. Please install ray via `pip install -e ".[ray]"`.' - ) - + raise ValueError('Ray is not available. 
Please install ray via `pip install -e ".[ray]"`.') + inference_inputs.sanity_check(drop_invalid=True) inference_inputs = inference_inputs.get_backend_dataset() - inference_inputs = ray.data.from_items(inference_inputs) - # -> Dict[str, np.ndarray] + inference_inputs = ray.data.from_items(inference_inputs) + # -> dict[str, np.ndarray] # Example (batch size=2): # {'input': array(['...','...'], dtype=object), # 'output': array([array(["...", "..."], dtype=object), array(['...','...'], dtype=object)], dtype=object), @@ -419,14 +359,13 @@ def prepare_inputs_for_inference( # array([ 27, 91, 882, ..., 128256, 128256, 128256])], # dtype=object) # ], dtype=object)} - + return inference_inputs - - + @staticmethod def postprocess_inference_outputs( dataset: Dataset, - scores: Union[List[float], List[List[float]]], + scores: Union[list[float], list[list[float]]], ): output_dict = {"type": "", "instances": []} if dataset.get_type() == "text_to_textlist": @@ -442,11 +381,11 @@ def postprocess_inference_outputs( ) else: raise NotImplementedError(f"Dataset type {dataset.get_type()} is not supported for reward model inference.") - + for i, instance_scores in enumerate(scores): for j, score in enumerate(instance_scores): output_dict["instances"][i]["output"][j][KEY_SCORE] = score - + output_dataset_args = copy.deepcopy(dataset.data_args) output_dataset_args.dataset_path = None output_dataset_args.dataset_name = f"{output_dataset_args.dataset_name}_scored" @@ -454,12 +393,11 @@ def postprocess_inference_outputs( output_dataset = output_dataset.from_dict(output_dict) return output_dataset - - + @staticmethod def postprocess_distributed_inference_outputs( dataset: Dataset, - inference_result: List[RewardModelInferenceResultWithInput], + inference_result: list[RewardModelInferenceResultWithInput], ): output_dict = {"type": "text_to_scored_textlist", "instances": inference_result} output_dataset_args = copy.deepcopy(dataset.data_args) @@ -467,21 +405,20 @@ def postprocess_distributed_inference_outputs( output_dataset_args.dataset_name = f"{output_dataset_args.dataset_name}_scored" output_dataset = Dataset(output_dataset_args) output_dataset = output_dataset.from_dict(output_dict) - + return output_dataset - def save(self, dir, *args, **kwargs): """ Perform generation process of the model. - + Parameters ------------ dir : The directory to save model and tokenizer - + kwargs : Optional. - Keyword arguments. + Keyword arguments. 
""" self.get_tokenizer().save_pretrained(dir) self.get_backend_model().save_pretrained(dir) diff --git a/src/lmflow/models/interfaces/tunable.py b/src/lmflow/models/interfaces/tunable.py index ac8998c3a..5852de243 100644 --- a/src/lmflow/models/interfaces/tunable.py +++ b/src/lmflow/models/interfaces/tunable.py @@ -1,10 +1,6 @@ #!/usr/bin/env python -# coding=utf-8 -"""Tunable class -""" +"""Tunable class""" -from abc import ABC - -class Tunable(ABC): +class Tunable: pass diff --git a/src/lmflow/models/regression_model.py b/src/lmflow/models/regression_model.py index 43d0dfc7b..9dff47c9e 100644 --- a/src/lmflow/models/regression_model.py +++ b/src/lmflow/models/regression_model.py @@ -1,11 +1,9 @@ #!/usr/bin/env python -# coding=utf-8 """General regression model.""" from lmflow.models.base_model import BaseModel class RegressionModel(BaseModel): - def __init__(self, *args, **kwargs): pass diff --git a/src/lmflow/models/text_regression_model.py b/src/lmflow/models/text_regression_model.py index 285cbc4f4..25690d612 100644 --- a/src/lmflow/models/text_regression_model.py +++ b/src/lmflow/models/text_regression_model.py @@ -1,11 +1,10 @@ #!/usr/bin/env python -# coding=utf-8 """ A model maps "text_only" data to float. """ -from lmflow.models.regression_model import RegressionModel from lmflow.datasets.dataset import Dataset +from lmflow.models.regression_model import RegressionModel class TextRegressionModel(RegressionModel): @@ -15,36 +14,29 @@ class TextRegressionModel(RegressionModel): Parameters ------------ - model_args : + model_args : Model arguments such as model name, path, revision, etc. args : Optional. Positional arguments. kwargs : Optional. - Keyword arguments. + Keyword arguments. """ - def __init__( - self, - model_args, - *args, - **kwargs - ): + def __init__(self, model_args, *args, **kwargs): """ Initializes a TextRegressionModel instance. :param model_args: dictionary with model arguments such as model name, path, revision, etc. """ self.inference_func = None - def register_inference_function(self, inference_func): """ Registers a regression function. """ self.inference_func = inference_func - def inference(self, inputs: Dataset): """ Gets regression results of a given dataset. 
diff --git a/src/lmflow/models/vision2seq_model.py b/src/lmflow/models/vision2seq_model.py index dc4d70a6a..b80982df7 100644 --- a/src/lmflow/models/vision2seq_model.py +++ b/src/lmflow/models/vision2seq_model.py @@ -1,25 +1,20 @@ #!/usr/bin/env python -# coding=utf-8 # TODO update the doc -import copy -import logging -import time +from typing import Optional, Union + import torch import torch.nn as nn -from typing import List, Optional, Tuple, Union from torch.nn import CrossEntropyLoss - from transformers import ( + AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM, - AutoModel, - Blip2ForConditionalGeneration, Blip2Config, + Blip2ForConditionalGeneration, + Blip2PreTrainedModel, Blip2QFormerModel, Blip2VisionModel, - Blip2PreTrainedModel, - PreTrainedModel, ) from transformers.modeling_outputs import CausalLMOutputWithPast @@ -27,20 +22,22 @@ from lmflow.models.vision_encoder import build_vision_tower from lmflow.utils.versioning import is_package_version_at_least -if is_package_version_at_least('transformers', '4.46.0'): +if is_package_version_at_least("transformers", "4.46.0"): from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled else: from transformers.deepspeed import is_deepspeed_zero3_enabled class CustomAutoVision2SeqModel(Blip2ForConditionalGeneration, BaseModel): - def __init__(self, - config: Blip2Config, - image_encoder_name_or_path=None, - qformer_name_or_path=None, - language_model_name_or_path=None, - low_resource=False,): - ''' + def __init__( + self, + config: Blip2Config, + image_encoder_name_or_path=None, + qformer_name_or_path=None, + language_model_name_or_path=None, + low_resource=False, + ): + """ TODO update the docs Args: config: @@ -49,10 +46,9 @@ def __init__(self, qformer_name_or_path: language_model_name_or_path: Returns: - ''' + """ super(Blip2PreTrainedModel, self).__init__(config) - self.custom_vision_model = getattr( - config, "custom_vision_model", False) + self.custom_vision_model = getattr(config, "custom_vision_model", False) self.with_qformer = getattr(config, "with_qformer", True) # vision model if self.custom_vision_model: @@ -63,8 +59,7 @@ def __init__(self, self.image_processor = self.vision_model.image_processor elif image_encoder_name_or_path is not None: # use the model from transformers - self.vision_model = AutoModel.from_pretrained( - image_encoder_name_or_path) + self.vision_model = AutoModel.from_pretrained(image_encoder_name_or_path) config.vision_config = self.vision_model.config else: # the default vit in Blip2 @@ -74,37 +69,28 @@ def __init__(self, # and the llava based models don't use qformer. 
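When the Q-Former is used, the construction that follows allocates `num_query_tokens` learned query vectors plus a linear projection from the Q-Former hidden size into the language model's embedding space; at run time the queries attend over the image embeddings and the projected output is what the language model actually consumes. A toy shape-level sketch (dimensions are illustrative, not the real config values):

```python
import torch
import torch.nn as nn

# Illustrative dimensions standing in for config.num_query_tokens,
# config.qformer_config.hidden_size and the language model hidden size.
num_query_tokens, qformer_hidden, lm_hidden, batch = 32, 768, 4096, 2

query_tokens = nn.Parameter(torch.zeros(1, num_query_tokens, qformer_hidden))
language_projection = nn.Linear(qformer_hidden, lm_hidden)

# The real Q-Former attends from the queries over the image embeddings; here the
# expanded query tokens stand in for its last hidden state.
query_output = query_tokens.expand(batch, -1, -1)
language_model_inputs = language_projection(query_output)
assert language_model_inputs.shape == (batch, num_query_tokens, lm_hidden)
```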
if qformer_name_or_path is not None: self.query_tokens = nn.Parameter( - torch.zeros(1, config.num_query_tokens, - config.qformer_config.hidden_size)) - self.qformer = AutoModel.from_pretrained( - qformer_name_or_path) + torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size) + ) + self.qformer = AutoModel.from_pretrained(qformer_name_or_path) else: self.query_tokens = nn.Parameter( - torch.zeros(1, config.num_query_tokens, - config.qformer_config.hidden_size)) + torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size) + ) self.qformer = Blip2QFormerModel(config.qformer_config) kwargs = dict() if language_model_name_or_path is not None: if low_resource: - kwargs = dict( - torch_dtype=torch.float16, - load_in_8bit=True, - device_map="auto", - low_cpu_mem_usage=True) + kwargs = dict(torch_dtype=torch.float16, load_in_8bit=True, device_map="auto", low_cpu_mem_usage=True) else: if not is_deepspeed_zero3_enabled: - kwargs = dict(device_map="auto", - torch_dtype=torch.float16) - language_model = AutoModelForCausalLM.from_pretrained( - language_model_name_or_path, **kwargs) + kwargs = dict(device_map="auto", torch_dtype=torch.float16) + language_model = AutoModelForCausalLM.from_pretrained(language_model_name_or_path, **kwargs) config.text_config = language_model.config else: if config.use_decoder_only_language_model: - language_model = AutoModelForCausalLM.from_config( - config.text_config, **kwargs) + language_model = AutoModelForCausalLM.from_config(config.text_config, **kwargs) else: - language_model = AutoModelForSeq2SeqLM.from_config( - config.text_config, **kwargs) + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config, **kwargs) # Update _tied_weights_keys using the base model used. if getattr(language_model, "_tied_weights_keys", None) is not None: self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] @@ -112,14 +98,11 @@ def __init__(self, self.language_model = language_model if self.with_qformer: self.language_projection = nn.Linear( - self.qformer.config.hidden_size, - self.language_model.config.hidden_size) + self.qformer.config.hidden_size, self.language_model.config.hidden_size + ) else: - self.language_projection = nn.Linear( - self.vision_model.hidden_size, - self.language_model.config.hidden_size) - if image_encoder_name_or_path is None and \ - language_model_name_or_path is None: + self.language_projection = nn.Linear(self.vision_model.hidden_size, self.language_model.config.hidden_size) + if image_encoder_name_or_path is None and language_model_name_or_path is None: self.post_init() # for deepspeed self.hidden_size = self.language_model.config.hidden_size @@ -129,40 +112,26 @@ def get_backend_model(self): return self def vision_model_from_pretrained(self, pretrained_path): - self.vision_model = self.vision_model.from_pretrained( - pretrained_path, - config=self.config.vision_config) + self.vision_model = self.vision_model.from_pretrained(pretrained_path, config=self.config.vision_config) def qformer_from_pretrained(self, pretrained_path): - self.qformer = self.qformer.from_pretrained( - pretrained_path, - config=self.config.qformer_config) - - def language_model_from_pretrained(self, - pretrained_path, - low_resource=False, - use_prompt_cache=False): + self.qformer = self.qformer.from_pretrained(pretrained_path, config=self.config.qformer_config) + + def language_model_from_pretrained(self, pretrained_path, low_resource=False, use_prompt_cache=False): # TODO remove the low resource 
related loading in the future self.use_prompt_cache = use_prompt_cache if low_resource: - kwargs = dict( - torch_dtype=torch.float16, - load_in_8bit=True, - device_map="auto" - ) + kwargs = dict(torch_dtype=torch.float16, load_in_8bit=True, device_map="auto") else: kwargs = {} past_model_dim = self.language_model.model_dim self.language_model = AutoModelForCausalLM.from_pretrained( - pretrained_path, - config=self.config.text_config, - **kwargs) + pretrained_path, config=self.config.text_config, **kwargs + ) if self.config.text_config.hidden_size != past_model_dim: # should update the language projection layer in_channels = self.language_projection.in_features - self.language_projection = nn.Linear(in_channels, - self.config.text_config.hidden_size, - bias=True) + self.language_projection = nn.Linear(in_channels, self.config.text_config.hidden_size, bias=True) def vision_feature_select(self, image_forward_outs): image_features = image_forward_outs.hidden_states[self.vision_feature_select_layer] @@ -171,7 +140,7 @@ def vision_feature_select(self, image_forward_outs): elif self.select_vision_feature_type == "cls_patch": image_features = image_features else: - raise ValueError(f'Unexpected select feature: {self.select_feature}') + raise ValueError(f"Unexpected select feature: {self.select_feature}") return image_features def register_prompt_cache(self, prompt_ids, prompt_keys_values): @@ -200,12 +169,7 @@ def save_prompt_cache(self, path): None """ - torch.save( - dict( - prompt_ids=self.prompt_ids, - prompt_keys_values=self.prompt_keys_values - ), - path) + torch.save(dict(prompt_ids=self.prompt_ids, prompt_keys_values=self.prompt_keys_values), path) def load_prompt_cache(self, path): """ @@ -217,8 +181,7 @@ def load_prompt_cache(self, path): None """ prompt_cache = torch.load(path) - self.register_prompt_cache(prompt_cache["prompt_ids"], - prompt_cache["prompt_keys_values"]) + self.register_prompt_cache(prompt_cache["prompt_ids"], prompt_cache["prompt_keys_values"]) def get_tokenizer(self): return self.tokenizer @@ -229,16 +192,19 @@ def forward( pixel_values: Optional[torch.FloatTensor] = None, images: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[list[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - image_token_indexes: Optional[List] = [0], + image_token_indexes: Optional[list] = None, one_sample_multiple_images: bool = False, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> Union[tuple, CausalLMOutputWithPast]: + if not image_token_indexes: + image_token_indexes = [0] + if pixel_values is None and images is not None: pixel_values = images @@ -255,21 +221,18 @@ def forward( # do the processing as blip2 and mini gpt-4; if past_key_values is not None and input_ids.shape[1] == 1: # no need to recompute the key values - attention_mask = torch.ones(( - attention_mask.shape[0], - past_key_values[-1][-1].shape[-2] + 1), + attention_mask = torch.ones( + (attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, - device=attention_mask.device) + device=attention_mask.device, + ) else: - image_embeds = self.vision_model( - pixel_values, return_dict=True).last_hidden_state + image_embeds = 
self.vision_model(pixel_values, return_dict=True).last_hidden_state image_attention_mask = torch.ones( - image_embeds.size()[:-1], - dtype=torch.long, - device=image_embeds.device) + image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device + ) if self.with_qformer: - query_tokens = self.query_tokens.expand( - image_embeds.shape[0], -1, -1) + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, @@ -280,26 +243,25 @@ def forward( query_outputs = image_embeds query_output = query_outputs.last_hidden_state language_model_inputs = self.language_projection(query_output) - inputs_embeds, attention_mask = \ - self.processor_image_token_in_minigpt4( - input_ids, - language_model_inputs, - attention_mask, - image_token_indexes, - pixel_values, - batch_size) + inputs_embeds, attention_mask = self.processor_image_token_in_minigpt4( + input_ids, language_model_inputs, attention_mask, image_token_indexes, pixel_values, batch_size + ) input_ids = None else: # do the processing in the vision model # language is the causallm model. # so use language model.model to do the embed_tokens - input_ids, attention_mask, past_key_values, inputs_embeds, labels = \ + input_ids, attention_mask, past_key_values, inputs_embeds, labels = ( self.vision_model.prepare_inputs_labels_for_multimodal( - input_ids, attention_mask, - past_key_values, labels, + input_ids, + attention_mask, + past_key_values, + labels, pixel_values, self.language_projection, - self.language_model.model) + self.language_model.model, + ) + ) # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) # TODO check how to generate the labels with image embeddings # print(input_ids, attention_mask) @@ -307,8 +269,7 @@ def forward( # print("input_embeds", inputs_embeds.shape) # attention_mask.shape, inputs_embeds.shape) # TODO remove this code by fixing the ddp training issue - inputs_embeds = inputs_embeds.to( - self.language_model.lm_head.weight.dtype) + inputs_embeds = inputs_embeds.to(self.language_model.lm_head.weight.dtype) outputs = self.language_model( input_ids=input_ids, attention_mask=attention_mask, @@ -317,7 +278,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict + return_dict=return_dict, ) if labels is not None: logits = outputs[0] @@ -325,8 +286,7 @@ def forward( shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view( - -1, self.config.text_config.vocab_size) + shift_logits = shift_logits.view(-1, self.config.text_config.vocab_size) shift_labels = shift_labels.view(-1) # Enable model/pipeline parallelism shift_labels = shift_labels.to(shift_logits.device) @@ -344,16 +304,11 @@ def forward( attentions=outputs.attentions, ) - def processor_image_token_in_minigpt4(self, - input_ids, - language_model_inputs, - attention_mask, - image_token_indexes, - pixel_values, - batch_size=1): + def processor_image_token_in_minigpt4( + self, input_ids, language_model_inputs, attention_mask, image_token_indexes, pixel_values, batch_size=1 + ): language_attention_mask = torch.ones( - language_model_inputs.size()[:-1], - dtype=torch.long, device=language_model_inputs.device + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device ) if input_ids is None: input_ids = ( @@ -378,27 +333,19 @@ def 
processor_image_token_in_minigpt4(self, for idx, image_token_index in enumerate(image_token_indexes): end_index += image_token_index - inputs_embeds_with_images.append( - inputs_embeds[:, start_index:end_index]) + inputs_embeds_with_images.append(inputs_embeds[:, start_index:end_index]) inputs_embeds_with_images.append(language_model_inputs[idx][None]) - attention_mask_with_images.append( - attention_mask[:, start_index:end_index]) - attention_mask_with_images.append( - language_attention_mask[idx][None]) + attention_mask_with_images.append(attention_mask[:, start_index:end_index]) + attention_mask_with_images.append(language_attention_mask[idx][None]) start_index = end_index - inputs_embeds_with_images.append( - inputs_embeds[:, image_token_indexes[-1]:]) - inputs_embeds = torch.cat( - inputs_embeds_with_images, dim=1) - attention_mask_with_images.append( - attention_mask[:, image_token_indexes[-1]:]) + inputs_embeds_with_images.append(inputs_embeds[:, image_token_indexes[-1] :]) + inputs_embeds = torch.cat(inputs_embeds_with_images, dim=1) + attention_mask_with_images.append(attention_mask[:, image_token_indexes[-1] :]) attention_mask = torch.cat(attention_mask_with_images, dim=1) # comebine the embeds - inputs_embeds = inputs_embeds.to( - self.language_model.lm_head.weight.dtype) - attention_mask = attention_mask.to( - self.language_model.lm_head.weight.dtype) + inputs_embeds = inputs_embeds.to(self.language_model.lm_head.weight.dtype) + attention_mask = attention_mask.to(self.language_model.lm_head.weight.dtype) return inputs_embeds, attention_mask @torch.no_grad() @@ -407,7 +354,7 @@ def generate( pixel_values: torch.FloatTensor, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.LongTensor] = None, - image_token_indexes: Optional[List] = [0], + image_token_indexes: Optional[list] = None, one_sample_multiple_images: Optional[bool] = False, images: Optional[torch.LongTensor] = None, **generate_kwargs, @@ -430,6 +377,8 @@ def generate( Returns: captions (list): A list of strings of length batch_size * num_captions. 
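`processor_image_token_in_minigpt4` above interleaves slices of the text embeddings with the projected image features at the requested `image_token_indexes`, then pads the attention mask to match. A simplified toy splice with one insertion point (shapes and values are illustrative; the real method also handles multiple images and the missing-`input_ids` case):

```python
import torch

# Toy shapes: batch 1, 6 text-token embeddings, 2 projected image features, hidden 4.
text_embeds = torch.randn(1, 6, 4)
image_feats = torch.randn(1, 2, 4)
image_token_indexes = [3]  # splice the image features in after the third text token

pieces, start = [], 0
for idx in image_token_indexes:
    pieces.append(text_embeds[:, start:idx])  # text before the image slot
    pieces.append(image_feats)                # projected image features
    start = idx
pieces.append(text_embeds[:, start:])         # remaining text

inputs_embeds = torch.cat(pieces, dim=1)
assert inputs_embeds.shape[1] == text_embeds.shape[1] + image_feats.shape[1]
```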
""" + if not image_token_indexes: + image_token_indexes = [0] if pixel_values is None and images is not None: pixel_values = images @@ -439,15 +388,10 @@ def generate( batch_size = 1 if not self.custom_vision_model: # do the processing as blip2 and mini gpt-4; - image_embeds = self.vision_model( - pixel_values, return_dict=True).last_hidden_state - image_attention_mask = torch.ones( - image_embeds.size()[:-1], - dtype=torch.long, - device=image_embeds.device) + image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) if self.with_qformer: - query_tokens = self.query_tokens.expand( - image_embeds.shape[0], -1, -1) + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) query_outputs = self.qformer( query_embeds=query_tokens, encoder_hidden_states=image_embeds, @@ -458,14 +402,9 @@ def generate( query_outputs = image_embeds query_output = query_outputs.last_hidden_state language_model_inputs = self.language_projection(query_output) - inputs_embeds, attention_mask = \ - self.processor_image_token_in_minigpt4( - input_ids, - language_model_inputs, - attention_mask, - image_token_indexes, - pixel_values, - batch_size) + inputs_embeds, attention_mask = self.processor_image_token_in_minigpt4( + input_ids, language_model_inputs, attention_mask, image_token_indexes, pixel_values, batch_size + ) input_ids = None else: # do the processing in the vision model @@ -474,19 +413,21 @@ def generate( if pixel_values.dim() == 3: # the batch dim is missing; pixel_values = pixel_values[None] - input_ids, attention_mask, past_key_values, inputs_embeds, labels = \ + input_ids, attention_mask, past_key_values, inputs_embeds, labels = ( self.vision_model.prepare_inputs_labels_for_multimodal( - input_ids, attention_mask, - None, None, + input_ids, + attention_mask, + None, + None, pixel_values, self.language_projection, - self.language_model.model) + self.language_model.model, + ) + ) # convert the dtype. 
# FIXME check when need to do this - inputs_embeds = inputs_embeds.to( - device=self.language_model.lm_head.weight.device) - inputs_embeds = inputs_embeds.to( - self.language_model.lm_head.weight.dtype) + inputs_embeds = inputs_embeds.to(device=self.language_model.lm_head.weight.device) + inputs_embeds = inputs_embeds.to(self.language_model.lm_head.weight.dtype) outputs = self.language_model.generate( inputs_embeds=inputs_embeds, attention_mask=attention_mask, diff --git a/src/lmflow/models/vision_encoder/__init__.py b/src/lmflow/models/vision_encoder/__init__.py index 66c5887b5..8a6e94bed 100644 --- a/src/lmflow/models/vision_encoder/__init__.py +++ b/src/lmflow/models/vision_encoder/__init__.py @@ -1 +1,3 @@ -from .clip_encoder import build_vision_tower \ No newline at end of file +from .clip_encoder import build_vision_tower + +__all__ = ["build_vision_tower"] diff --git a/src/lmflow/models/vision_encoder/clip_encoder.py b/src/lmflow/models/vision_encoder/clip_encoder.py index 8b85a8e4c..d1e069a67 100644 --- a/src/lmflow/models/vision_encoder/clip_encoder.py +++ b/src/lmflow/models/vision_encoder/clip_encoder.py @@ -1,21 +1,17 @@ -from webbrowser import get import torch import torch.nn as nn +from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel -from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig +from lmflow.utils.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX -from lmflow.utils.constants import (IGNORE_INDEX, - IMAGE_TOKEN_INDEX, - DEFAULT_IMAGE_PATCH_TOKEN, - DEFAULT_IM_START_TOKEN, - DEFAULT_IM_END_TOKEN) def build_vision_tower(vision_tower_cfg, **kwargs): - vision_tower = getattr(vision_tower_cfg, 'image_encoder_name_or_path', "openai/clip-vit-large-patch14") + vision_tower = getattr(vision_tower_cfg, "image_encoder_name_or_path", "openai/clip-vit-large-patch14") if vision_tower.startswith("openai") or vision_tower.startswith("laion"): return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) - raise ValueError(f'Unknown vision tower: {vision_tower}') + raise ValueError(f"Unknown vision tower: {vision_tower}") + # FIXME check if can directly use the BlipVisionEncoder class CLIPVisionTower(nn.Module): @@ -26,7 +22,7 @@ def __init__(self, vision_tower, args, delay_load=False): self.vision_tower_name = vision_tower self.select_layer = args.vision_select_layer - self.select_feature = getattr(args, 'vision_select_feature', 'patch') + self.select_feature = getattr(args, "vision_select_feature", "patch") if not delay_load: self.load_model() else: @@ -48,12 +44,12 @@ def encode_images(self, images, language_projection): def feature_select(self, image_forward_outs): image_features = image_forward_outs.hidden_states[self.select_layer] - if self.select_feature == 'patch': + if self.select_feature == "patch": image_features = image_features[:, 1:] - elif self.select_feature == 'cls_patch': + elif self.select_feature == "cls_patch": image_features = image_features else: - raise ValueError(f'Unexpected select feature: {self.select_feature}') + raise ValueError(f"Unexpected select feature: {self.select_feature}") return image_features @torch.no_grad() @@ -61,11 +57,15 @@ def forward(self, images): if type(images) is list: image_features = [] for image in images: - image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True) + image_forward_out = self.vision_tower( + image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True + ) 
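`feature_select` in the CLIP tower keeps either the patch tokens only (`"patch"`, dropping the leading CLS token) or the full CLS-plus-patch sequence (`"cls_patch"`) from the chosen hidden-states layer. A toy illustration of the difference (tensor sizes are made up):

```python
import torch

# One hidden-states layer from a toy CLIP vision model:
# batch 2, 1 CLS token + 256 patch tokens, hidden size 1024.
hidden = torch.randn(2, 257, 1024)

def feature_select(hidden_layer, select_feature="patch"):
    if select_feature == "patch":
        return hidden_layer[:, 1:]   # drop the CLS token, keep patch tokens
    elif select_feature == "cls_patch":
        return hidden_layer          # keep CLS and patch tokens
    raise ValueError(f"Unexpected select feature: {select_feature}")

assert feature_select(hidden, "patch").shape == (2, 256, 1024)
assert feature_select(hidden, "cls_patch").shape == (2, 257, 1024)
```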
image_feature = self.feature_select(image_forward_out).to(image.dtype) image_features.append(image_feature) else: - image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True) + image_forward_outs = self.vision_tower( + images.to(device=self.device, dtype=self.dtype), output_hidden_states=True + ) image_features = self.feature_select(image_forward_outs).to(images.dtype) return image_features @@ -98,27 +98,35 @@ def num_patches(self): return (self.config.image_size // self.config.patch_size) ** 2 def prepare_inputs_labels_for_multimodal( - self, input_ids, attention_mask, past_key_values, labels, images, + self, + input_ids, + attention_mask, + past_key_values, + labels, + images, language_projection=None, language_model=None, - **kwargs + **kwargs, ): - ''' + """ Copy from the LLAVA code base. Should be polished. - ''' + """ vision_tower = self.vision_tower # commonly used in model.generate (past_key_values is not None) # to avoid forward the image multiple time if vision_tower is None or images is None or input_ids.shape[1] == 1: - if (past_key_values is not None and - vision_tower is not None and - images is not None and - input_ids.shape[1] == 1): - attention_mask = torch.ones(( - attention_mask.shape[0], - past_key_values[-1][-1].shape[-2] + 1), - dtype=attention_mask.dtype, device=attention_mask.device) + if ( + past_key_values is not None + and vision_tower is not None + and images is not None + and input_ids.shape[1] == 1 + ): + attention_mask = torch.ones( + (attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), + dtype=attention_mask.dtype, + device=attention_mask.device, + ) return input_ids, attention_mask, past_key_values, None, labels if type(images) is list or images.ndim == 5: concat_images = torch.cat([image for image in images], dim=0) @@ -135,7 +143,7 @@ def prepare_inputs_labels_for_multimodal( if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0: # multimodal LLM, but the current sample is not multimodal cur_input_embeds = language_model.embed_tokens(cur_input_ids) - cur_input_embeds = cur_input_embeds + (0. 
* language_projection(vision_tower.dummy_feature)).sum() + cur_input_embeds = cur_input_embeds + (0.0 * language_projection(vision_tower.dummy_feature)).sum() new_input_embeds.append(cur_input_embeds) if labels is not None: new_labels.append(labels[batch_idx]) @@ -151,33 +159,53 @@ def prepare_inputs_labels_for_multimodal( cur_image_features = image_features[cur_image_idx] image_token_start = image_token_indices[0] # print("image token_start", image_token_start, - # "curr_input_ids", cur_input_ids.shape) - if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): - cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids[:image_token_start-1]).detach()) - cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids[image_token_start-1:image_token_start])) + # "curr_input_ids", cur_input_ids.shape) + if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr( + self.config, "mm_use_im_start_end", False + ): + cur_new_input_embeds.append( + language_model.embed_tokens(cur_input_ids[: image_token_start - 1]).detach() + ) + cur_new_input_embeds.append( + language_model.embed_tokens(cur_input_ids[image_token_start - 1 : image_token_start]) + ) cur_new_input_embeds.append(cur_image_features) - cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2])) + cur_new_input_embeds.append( + language_model.embed_tokens(cur_input_ids[image_token_start + 1 : image_token_start + 2]) + ) if labels is not None: cur_new_labels.append(cur_labels[:image_token_start]) - cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) - cur_new_labels.append(cur_labels[image_token_start:image_token_start+1]) - cur_labels = cur_labels[image_token_start+2:] + cur_new_labels.append( + torch.full( + (cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype + ) + ) + cur_new_labels.append(cur_labels[image_token_start : image_token_start + 1]) + cur_labels = cur_labels[image_token_start + 2 :] else: cur_input_ids = cur_input_ids.to(device=language_model.device) cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids[:image_token_start])) cur_new_input_embeds.append(cur_image_features) if labels is not None: cur_new_labels.append(cur_labels[:image_token_start]) - cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype)) - cur_labels = cur_labels[image_token_start+1:] + cur_new_labels.append( + torch.full( + (cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype + ) + ) + cur_labels = cur_labels[image_token_start + 1 :] cur_image_idx += 1 - if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): - cur_input_ids = cur_input_ids[image_token_start+2:] + if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr( + self.config, "mm_use_im_start_end", False + ): + cur_input_ids = cur_input_ids[image_token_start + 2 :] else: - cur_input_ids = cur_input_ids[image_token_start+1:] + cur_input_ids = cur_input_ids[image_token_start + 1 :] image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0] if cur_input_ids.numel() > 0: - if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False): + if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr( + self.config, 
"mm_use_im_start_end", False + ): cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids).detach()) else: cur_new_input_embeds.append(language_model.embed_tokens(cur_input_ids)) @@ -195,7 +223,17 @@ def prepare_inputs_labels_for_multimodal( new_input_embeds_align = [] for cur_new_embed in new_input_embeds: - cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0) + cur_new_embed = torch.cat( + ( + cur_new_embed, + torch.zeros( + (max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), + dtype=cur_new_embed.dtype, + device=cur_new_embed.device, + ), + ), + dim=0, + ) new_input_embeds_align.append(cur_new_embed) new_input_embeds = torch.stack(new_input_embeds_align, dim=0) @@ -203,15 +241,40 @@ def prepare_inputs_labels_for_multimodal( new_labels_align = [] _new_labels = new_labels for cur_new_label in new_labels: - cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0) + cur_new_label = torch.cat( + ( + cur_new_label, + torch.full( + (max_len - cur_new_label.shape[0],), + IGNORE_INDEX, + dtype=cur_new_label.dtype, + device=cur_new_label.device, + ), + ), + dim=0, + ) new_labels_align.append(cur_new_label) new_labels = torch.stack(new_labels_align, dim=0) if attention_mask is not None: new_attention_mask = [] - for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels): - new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device) - new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device) - cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0) + for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip( + attention_mask, _new_labels, new_labels + ): + new_attn_mask_pad_left = torch.full( + (cur_new_labels.shape[0] - labels.shape[1],), + True, + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + new_attn_mask_pad_right = torch.full( + (cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), + False, + dtype=attention_mask.dtype, + device=attention_mask.device, + ) + cur_new_attention_mask = torch.cat( + (new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0 + ) new_attention_mask.append(cur_new_attention_mask) attention_mask = torch.stack(new_attention_mask, dim=0) assert attention_mask.shape == new_labels.shape @@ -220,8 +283,12 @@ def prepare_inputs_labels_for_multimodal( if labels is not None: new_labels = torch.stack(new_labels, dim=0) if attention_mask is not None: - new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device) + new_attn_mask_pad_left = torch.full( + (attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), + True, + dtype=attention_mask.dtype, + device=attention_mask.device, + ) attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1) assert attention_mask.shape == new_input_embeds.shape[:2] - return None, attention_mask, past_key_values, \ - new_input_embeds, new_labels + return None, attention_mask, past_key_values, 
new_input_embeds, new_labels diff --git a/src/lmflow/optim/__init__.py b/src/lmflow/optim/__init__.py index e69de29bb..c7125f138 100644 --- a/src/lmflow/optim/__init__.py +++ b/src/lmflow/optim/__init__.py @@ -0,0 +1,3 @@ +from .utils import create_customized_optimizer + +__all__ = ["create_customized_optimizer"] diff --git a/src/lmflow/optim/adabelief.py b/src/lmflow/optim/adabelief.py index df17fea41..717fdd428 100644 --- a/src/lmflow/optim/adabelief.py +++ b/src/lmflow/optim/adabelief.py @@ -1,19 +1,30 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math + import torch from torch.optim.optimizer import Optimizer + class AdaBelief(Optimizer): r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 """ - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, - weight_decay=0, amsgrad=False, weight_decouple=True, fixed_decay=False, rectify=True, - degenerated_to_sgd=True, print_change_log = True): - + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-16, + weight_decay=0, + amsgrad=False, + weight_decouple=True, + fixed_decay=False, + rectify=True, + degenerated_to_sgd=True, + print_change_log=True, + ): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -26,47 +37,53 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, self.degenerated_to_sgd = degenerated_to_sgd if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): for param in params: - if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - - defaults = dict(lr=lr, betas=betas, eps=eps, - weight_decay=weight_decay, amsgrad=amsgrad, buffer=[[None, None, None] for _ in range(10)]) - super(AdaBelief, self).__init__(params, defaults) + if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): + param["buffer"] = [[None, None, None] for _ in range(10)] + + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + buffer=[[None, None, None] for _ in range(10)], + ) + super().__init__(params, defaults) self.degenerated_to_sgd = degenerated_to_sgd self.weight_decouple = weight_decouple self.rectify = rectify self.fixed_decay = fixed_decay if self.weight_decouple: - print('Weight decoupling enabled in AdaBelief') + print("Weight decoupling enabled in AdaBelief") if self.fixed_decay: - print('Weight decay fixed') + print("Weight decay fixed") if self.rectify: - print('Rectification enabled in AdaBelief') + print("Rectification enabled in AdaBelief") if amsgrad: - print('AMSGrad enabled in AdaBelief') + print("AMSGrad enabled in AdaBelief") def __setstate__(self, state): - super(AdaBelief, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: - group.setdefault('amsgrad', False) + group.setdefault("amsgrad", False) def reset(self): for group in self.param_groups: - for p in group['params']: + for p in group["params"]: state = self.state[p] - amsgrad = group['amsgrad'] + amsgrad = group["amsgrad"] # State initialization - state['step'] = 0 + state["step"] = 0 # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) + state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values - state['exp_avg_var'] = 
torch.zeros_like(p.data) + state["exp_avg_var"] = torch.zeros_like(p.data) if amsgrad: # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data) + state["max_exp_avg_var"] = torch.zeros_like(p.data) def step(self, closure=None): """Performs a single optimization step. @@ -79,10 +96,10 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue - + # cast data type half_precision = False if p.data.dtype == torch.float16: @@ -93,92 +110,99 @@ def step(self, closure=None): grad = p.grad.data if grad.is_sparse: raise RuntimeError( - 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') - amsgrad = group['amsgrad'] + "AdaBelief does not support sparse gradients, please consider SparseAdam instead" + ) + amsgrad = group["amsgrad"] state = self.state[p] - beta1, beta2 = group['betas'] + beta1, beta2 = group["betas"] # State initialization if len(state) == 0: - state['step'] = 0 + state["step"] = 0 # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p.data) + state["exp_avg"] = torch.zeros_like(p.data) # Exponential moving average of squared gradient values - state['exp_avg_var'] = torch.zeros_like(p.data) + state["exp_avg_var"] = torch.zeros_like(p.data) if amsgrad: # Maintains max of all exp. moving avg. of sq. grad. values - state['max_exp_avg_var'] = torch.zeros_like(p.data) - + state["max_exp_avg_var"] = torch.zeros_like(p.data) + # perform weight decay, check if decoupled weight decay if self.weight_decouple: if not self.fixed_decay: - p.data.mul_(1.0 - group['lr'] * group['weight_decay']) + p.data.mul_(1.0 - group["lr"] * group["weight_decay"]) else: - p.data.mul_(1.0 - group['weight_decay']) + p.data.mul_(1.0 - group["weight_decay"]) else: - if group['weight_decay'] != 0: - grad.add_(p.data, alpha=group['weight_decay']) + if group["weight_decay"] != 0: + grad.add_(p.data, alpha=group["weight_decay"]) # get current state variable - exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] + exp_avg, exp_avg_var = state["exp_avg"], state["exp_avg_var"] - state['step'] += 1 - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] + state["step"] += 1 + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] # Update first and second moment running average exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) grad_residual = grad - exp_avg - exp_avg_var.mul_(beta2).addcmul_( grad_residual, grad_residual, value=1 - beta2) + exp_avg_var.mul_(beta2).addcmul_(grad_residual, grad_residual, value=1 - beta2) if amsgrad: - max_exp_avg_var = state['max_exp_avg_var'] + max_exp_avg_var = state["max_exp_avg_var"] # Maintains the maximum of all 2nd moment running avg. till now - torch.max(max_exp_avg_var, exp_avg_var.add_(group['eps']), out=max_exp_avg_var) + torch.max(max_exp_avg_var, exp_avg_var.add_(group["eps"]), out=max_exp_avg_var) # Use the max. for normalizing running avg. 
of gradient - denom = (max_exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + denom = (max_exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group["eps"]) else: - denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - + denom = (exp_avg_var.add_(group["eps"]).sqrt() / math.sqrt(bias_correction2)).add_(group["eps"]) + # update if not self.rectify: # Default update - step_size = group['lr'] / bias_correction1 - p.data.addcdiv_( exp_avg, denom, value=-step_size) + step_size = group["lr"] / bias_correction1 + p.data.addcdiv_(exp_avg, denom, value=-step_size) else: # Rectified update, forked from RAdam - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: + buffered = group["buffer"][int(state["step"] % 10)] + if state["step"] == buffered[0]: N_sma, step_size = buffered[1], buffered[2] else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] + buffered[0] = state["step"] + beta2_t = beta2 ** state["step"] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: step_size = math.sqrt( - (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / ( - N_sma_max - 2)) / (1 - beta1 ** state['step']) + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) / (1 - beta1 ** state["step"]) elif self.degenerated_to_sgd: - step_size = 1.0 / (1 - beta1 ** state['step']) + step_size = 1.0 / (1 - beta1 ** state["step"]) else: step_size = -1 buffered[2] = step_size if N_sma >= 5: - denom = exp_avg_var.sqrt().add_(group['eps']) - p.data.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) + denom = exp_avg_var.sqrt().add_(group["eps"]) + p.data.addcdiv_(exp_avg, denom, value=-step_size * group["lr"]) elif step_size > 0: - p.data.add_( exp_avg, alpha=-step_size * group['lr']) - + p.data.add_(exp_avg, alpha=-step_size * group["lr"]) + if half_precision: p.data = p.data.half() - p.grad = p.grad.half() + p.grad = p.grad.half() - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adabound.py b/src/lmflow/optim/adabound.py index 1b6c603d8..e30511967 100644 --- a/src/lmflow/optim/adabound.py +++ b/src/lmflow/optim/adabound.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math import torch from torch.optim.optimizer import Optimizer + class AdaBound(Optimizer): r"""Implements AdaBound algorithm. 
- + It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate https://arxiv.org/abs/1902.09843 @@ -20,7 +20,7 @@ def __init__( self, params, lr: float = 1e-3, - betas = (0.9, 0.999), + betas=(0.9, 0.999), final_lr: float = 0.1, gamma: float = 1e-3, eps: float = 1e-8, @@ -32,23 +32,15 @@ def __init__( if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if final_lr < 0.0: - raise ValueError( - "Invalid final learning rate: {}".format(final_lr) - ) + raise ValueError("Invalid final learning rate: {}".format(final_lr)) if not 0.0 <= gamma < 1.0: raise ValueError("Invalid gamma parameter: {}".format(gamma)) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict( lr=lr, betas=betas, @@ -58,15 +50,15 @@ def __init__( weight_decay=weight_decay, amsbound=amsbound, ) - super(AdaBound, self).__init__(params, defaults) + super().__init__(params, defaults) self.base_lrs = [group["lr"] for group in self.param_groups] def __setstate__(self, state) -> None: - super(AdaBound, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: group.setdefault("amsbound", False) - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -82,10 +74,7 @@ def step(self, closure = None): continue grad = p.grad.data if grad.is_sparse: - msg = ( - "AdaBound does not support sparse gradients, " - "please consider SparseAdam instead" - ) + msg = "AdaBound does not support sparse gradients, please consider SparseAdam instead" raise RuntimeError(msg) amsbound = group["amsbound"] @@ -95,19 +84,13 @@ def step(self, closure = None): if len(state) == 0: state["step"] = 0 # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) + state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format) # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) if amsbound: # Maintains max of all exp. moving avg. of # sq. grad. 
values - state["max_exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) + state["max_exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] if amsbound: @@ -133,26 +116,16 @@ def step(self, closure = None): bias_correction1 = 1 - beta1 ** state["step"] bias_correction2 = 1 - beta2 ** state["step"] - step_size = ( - group["lr"] - * math.sqrt(bias_correction2) - / bias_correction1 - ) + step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 # Applies bounds on actual learning rate # lr_scheduler cannot affect final_lr, this is a workaround # to apply lr decay final_lr = group["final_lr"] * group["lr"] / base_lr - lower_bound = final_lr * ( - 1 - 1 / (group["gamma"] * state["step"] + 1) - ) - upper_bound = final_lr * ( - 1 + 1 / (group["gamma"] * state["step"]) - ) + lower_bound = final_lr * (1 - 1 / (group["gamma"] * state["step"] + 1)) + upper_bound = final_lr * (1 + 1 / (group["gamma"] * state["step"])) step_size = torch.full_like(denom, step_size) - step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_( - exp_avg - ) + step_size.div_(denom).clamp_(lower_bound, upper_bound).mul_(exp_avg) p.data.add_(-step_size) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adadelta.py b/src/lmflow/optim/adadelta.py index 5755692cb..bbb8acd77 100644 --- a/src/lmflow/optim/adadelta.py +++ b/src/lmflow/optim/adadelta.py @@ -1,13 +1,13 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch from torch.optim.optimizer import Optimizer + class Adadelta(Optimizer): def __init__(self, params, lr=1.0, rho=0.95, eps=1e-6): defaults = dict(lr=lr, rho=rho, eps=eps) - super(Adadelta, self).__init__(params, defaults) + super().__init__(params, defaults) def step(self, closure=None): loss = None @@ -15,21 +15,21 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data state = self.state[p] if len(state) == 0: - state['step'] = 0 - state['square_avg'] = torch.zeros_like(p.data) - state['acc_delta'] = torch.zeros_like(p.data) + state["step"] = 0 + state["square_avg"] = torch.zeros_like(p.data) + state["acc_delta"] = torch.zeros_like(p.data) - square_avg, acc_delta = state['square_avg'], state['acc_delta'] - rho, eps = group['rho'], group['eps'] + square_avg, acc_delta = state["square_avg"], state["acc_delta"] + rho, eps = group["rho"], group["eps"] - state['step'] += 1 + state["step"] += 1 square_avg.mul_(rho).addcmul_(1 - rho, grad, grad) @@ -40,4 +40,4 @@ def step(self, closure=None): acc_delta.mul_(rho).addcmul_(1 - rho, delta, delta) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adagrad.py b/src/lmflow/optim/adagrad.py index 29977173c..2ad383c12 100644 --- a/src/lmflow/optim/adagrad.py +++ b/src/lmflow/optim/adagrad.py @@ -1,35 +1,34 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch -from torch.optim.optimizer import Optimizer + class AdaGrad(torch.optim.Optimizer): def __init__(self, params, lr=0.001, eps=1e-8, weight_decay=0): defaults = dict(lr=lr, eps=eps, weight_decay=weight_decay) - super(AdaGrad, self).__init__(params, defaults) - + super().__init__(params, defaults) + def step(self, closure=None): loss = None if closure is not None: loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue 
grad = p.grad.data - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) + if group["weight_decay"] != 0: + grad = grad.add(group["weight_decay"], p.data) state = self.state[p] if len(state) == 0: - state['sum'] = torch.zeros_like(p.data) + state["sum"] = torch.zeros_like(p.data) - sum = state['sum'] + sum = state["sum"] sum.addcmul_(1, grad, grad) - std = sum.sqrt().add_(group['eps']) - p.data.addcdiv_(-group['lr'], grad, std) + std = sum.sqrt().add_(group["eps"]) + p.data.addcdiv_(-group["lr"], grad, std) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adam.py b/src/lmflow/optim/adam.py index 17c940519..cd33c6090 100644 --- a/src/lmflow/optim/adam.py +++ b/src/lmflow/optim/adam.py @@ -1,13 +1,13 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch from torch.optim.optimizer import Optimizer + class Adam(Optimizer): def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8): defaults = dict(lr=lr, betas=betas, eps=eps) - super(Adam, self).__init__(params, defaults) + super().__init__(params, defaults) def step(self, closure=None): loss = None @@ -15,32 +15,32 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data - + state = self.state[p] if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p.data) - state['exp_avg_sq'] = torch.zeros_like(p.data) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p.data) + state["exp_avg_sq"] = torch.zeros_like(p.data) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] - state['step'] += 1 + state["step"] += 1 exp_avg.mul_(beta1).add_(1 - beta1, grad) exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] - step_size = group['lr'] * (bias_correction2 ** 0.5) / bias_correction1 - denom = exp_avg_sq.sqrt().add_(group['eps']) + step_size = group["lr"] * (bias_correction2**0.5) / bias_correction1 + denom = exp_avg_sq.sqrt().add_(group["eps"]) p.data.addcdiv_(-step_size, exp_avg, denom) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adamax.py b/src/lmflow/optim/adamax.py index c99ed476c..aec36709b 100644 --- a/src/lmflow/optim/adamax.py +++ b/src/lmflow/optim/adamax.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch from torch.optim.optimizer import Optimizer + class Adamax(Optimizer): def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): if not 0.0 <= lr: @@ -17,10 +17,10 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - super(Adamax, self).__init__(params, defaults) + super().__init__(params, defaults) def __setstate__(self, state): - super(Adamax, self).__setstate__(state) + super().__setstate__(state) def step(self, closure=None): loss = None @@ -28,41 +28,38 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: 
if p.grad is None: continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('Adamax does not support sparse gradients') + raise RuntimeError("Adamax does not support sparse gradients") state = self.state[p] # State initialization if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p.data) - state['exp_inf'] = torch.zeros_like(p.data) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p.data) + state["exp_inf"] = torch.zeros_like(p.data) - exp_avg, exp_inf = state['exp_avg'], state['exp_inf'] - beta1, beta2 = group['betas'] + exp_avg, exp_inf = state["exp_avg"], state["exp_inf"] + beta1, beta2 = group["betas"] - state['step'] += 1 + state["step"] += 1 - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) + if group["weight_decay"] != 0: + grad = grad.add(group["weight_decay"], p.data) # Update biased first moment estimate exp_avg.mul_(beta1).add_(1 - beta1, grad) # Update the exponentially weighted infinity norm - norm_buf = torch.cat([ - exp_inf.mul_(beta2).unsqueeze(0), - grad.abs().unsqueeze_(0) - ], 0) + norm_buf = torch.cat([exp_inf.mul_(beta2).unsqueeze(0), grad.abs().unsqueeze_(0)], 0) torch.max(norm_buf, 0, keepdim=False, out=(exp_inf, exp_inf.new().long())) - bias_correction = 1 - beta1 ** state['step'] - clr = group['lr'] / bias_correction + bias_correction = 1 - beta1 ** state["step"] + clr = group["lr"] / bias_correction - p.data.addcdiv_(-clr, exp_avg, exp_inf + group['eps']) + p.data.addcdiv_(-clr, exp_avg, exp_inf + group["eps"]) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adamp.py b/src/lmflow/optim/adamp.py index ec0fe1a8c..171cf2828 100644 --- a/src/lmflow/optim/adamp.py +++ b/src/lmflow/optim/adamp.py @@ -1,7 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math + import torch from torch.optim.optimizer import Optimizer @@ -21,7 +21,7 @@ def __init__( self, params, lr: float = 1e-3, - betas = (0.9, 0.999), + betas=(0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0, delta: float = 0.1, @@ -33,17 +33,11 @@ def __init__( if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) if delta < 0: raise ValueError("Invalid delta value: {}".format(delta)) if wd_ratio < 0: @@ -58,7 +52,7 @@ def __init__( wd_ratio=wd_ratio, nesterov=nesterov, ) - super(AdamP, self).__init__(params, defaults) + super().__init__(params, defaults) @staticmethod def _channel_view(x): @@ -86,19 +80,15 @@ def _projection(self, p, grad, perturb, delta, wd_ratio, eps): cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): - p_n = p.data / view_func(p.data).norm(dim=1).view( - expand_size - ).add_(eps) - perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view( - expand_size - ) + p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) wd = wd_ratio 
return perturb, wd return perturb, wd - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -122,12 +112,8 @@ def step(self, closure = None): # State initialization if len(state) == 0: state["step"] = 0 - state["exp_avg"] = torch.zeros_like( - p.data, memory_format=torch.preserve_format - ) - state["exp_avg_sq"] = torch.zeros_like( - p.data, memory_format=torch.preserve_format - ) + state["exp_avg"] = torch.zeros_like(p.data, memory_format=torch.preserve_format) + state["exp_avg_sq"] = torch.zeros_like(p.data, memory_format=torch.preserve_format) # Adam exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] @@ -139,9 +125,7 @@ def step(self, closure = None): exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_( - group["eps"] - ) + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group["eps"]) step_size = group["lr"] / bias_correction1 if nesterov: @@ -163,11 +147,9 @@ def step(self, closure = None): # Weight decay if group["weight_decay"] > 0: - p.data.mul_( - 1 - group["lr"] * group["weight_decay"] * wd_ratio - ) + p.data.mul_(1 - group["lr"] * group["weight_decay"] * wd_ratio) # Step p.data.add_(perturb, alpha=-step_size) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/adamw_schedule_free.py b/src/lmflow/optim/adamw_schedule_free.py index 9b0638ba7..0202ba678 100644 --- a/src/lmflow/optim/adamw_schedule_free.py +++ b/src/lmflow/optim/adamw_schedule_free.py @@ -1,75 +1,83 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# +# # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. #!/usr/bin/env python # -*- coding: utf-8 -*- +import math + import torch import torch.optim -import math + class AdamWScheduleFree(torch.optim.Optimizer): r""" Schedule-Free AdamW - As the name suggests, no scheduler is needed with this optimizer. + As the name suggests, no scheduler is needed with this optimizer. To add warmup, rather than using a learning rate schedule you can just set the warmup_steps parameter. - + This optimizer requires that .train() and .eval() be called before the beginning of training and evaluation respectively. The optimizer should also be placed in eval mode when saving checkpoints. 
""" - def __init__(self, - params, - lr=0.0025, - betas=(0.9, 0.999), - eps=1e-8, - weight_decay=0, - warmup_steps=0, - r=0.0, - weight_lr_power=2.0, - foreach=hasattr(torch, "_foreach_mul_") - ): - - defaults = dict(lr=lr, - betas=betas, - eps=eps, - r=r, - k=0, - warmup_steps=warmup_steps, - train_mode=True, - weight_sum=0.0, - lr_max=-1.0, - weight_lr_power=weight_lr_power, - weight_decay=weight_decay, - foreach=foreach) + + def __init__( + self, + params, + lr=0.0025, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + warmup_steps=0, + r=0.0, + weight_lr_power=2.0, + foreach=None, + ): + if not foreach: + foreach = hasattr(torch, "_foreach_mul_") + + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + r=r, + k=0, + warmup_steps=warmup_steps, + train_mode=True, + weight_sum=0.0, + lr_max=-1.0, + weight_lr_power=weight_lr_power, + weight_decay=weight_decay, + foreach=foreach, + ) super().__init__(params, defaults) - + def eval(self): for group in self.param_groups: - train_mode = group['train_mode'] - beta1, _ = group['betas'] + train_mode = group["train_mode"] + beta1, _ = group["betas"] if train_mode: - for p in group['params']: + for p in group["params"]: state = self.state[p] - if 'z' in state: + if "z" in state: # Set p.data to x - p.data.lerp_(end=state['z'], weight=1-1/beta1) - group['train_mode'] = False + p.data.lerp_(end=state["z"], weight=1 - 1 / beta1) + group["train_mode"] = False def train(self): for group in self.param_groups: - train_mode = group['train_mode'] - beta1, _ = group['betas'] + train_mode = group["train_mode"] + beta1, _ = group["betas"] if not train_mode: - for p in group['params']: + for p in group["params"]: state = self.state[p] - if 'z' in state: + if "z" in state: # Set p.data to y - p.data.lerp_(end=state['z'], weight=1-beta1) - group['train_mode'] = True + p.data.lerp_(end=state["z"], weight=1 - beta1) + group["train_mode"] = True def step(self, closure=None): """Performs a single optimization step. 
@@ -82,54 +90,52 @@ def step(self, closure=None): loss = None if closure is not None: loss = closure() - + for group in self.param_groups: - eps = group['eps'] - beta1, beta2 = group['betas'] - decay = group['weight_decay'] - k = group['k'] - r = group['r'] - warmup_steps = group['warmup_steps'] - weight_lr_power = group['weight_lr_power'] - + eps = group["eps"] + beta1, beta2 = group["betas"] + decay = group["weight_decay"] + k = group["k"] + r = group["r"] + warmup_steps = group["warmup_steps"] + weight_lr_power = group["weight_lr_power"] + if k < warmup_steps: - sched = (k+1) / warmup_steps + sched = (k + 1) / warmup_steps else: - sched = 1.0 - - bias_correction2 = 1 - beta2 ** (k+1) - lr = group['lr']*sched*math.sqrt(bias_correction2) - - lr_max = group['lr_max'] = max(lr, group['lr_max']) - - weight = ((k+1)**r) * (lr_max**weight_lr_power) - weight_sum = group['weight_sum'] = group['weight_sum'] + weight + sched = 1.0 + + bias_correction2 = 1 - beta2 ** (k + 1) + lr = group["lr"] * sched * math.sqrt(bias_correction2) + + lr_max = group["lr_max"] = max(lr, group["lr_max"]) + + weight = ((k + 1) ** r) * (lr_max**weight_lr_power) + weight_sum = group["weight_sum"] = group["weight_sum"] + weight try: - ckp1 = weight/weight_sum + ckp1 = weight / weight_sum except ZeroDivisionError: ckp1 = 0 - if not group['train_mode']: + if not group["train_mode"]: raise Exception("Not in train mode!") - active_p = [p for p in group['params'] if p.grad is not None] - + active_p = [p for p in group["params"] if p.grad is not None] + for p in active_p: - if 'z' not in self.state[p]: - self.state[p]['z'] = torch.clone(p.data) - self.state[p]['exp_avg_sq'] = torch.zeros_like(p.data) + if "z" not in self.state[p]: + self.state[p]["z"] = torch.clone(p.data) + self.state[p]["exp_avg_sq"] = torch.zeros_like(p.data) - if group['foreach'] and len(active_p) > 0: - y, grad, exp_avg_sq, z = zip(*[(p.data, - p.grad, - self.state[p]['exp_avg_sq'], - self.state[p]['z']) - for p in active_p]) + if group["foreach"] and len(active_p) > 0: + y, grad, exp_avg_sq, z = zip( + *[(p.data, p.grad, self.state[p]["exp_avg_sq"], self.state[p]["z"]) for p in active_p] + ) # Decay the first and second moment running average coefficient torch._foreach_mul_(exp_avg_sq, beta2) - torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1-beta2) + torch._foreach_addcmul_(exp_avg_sq, grad, grad, value=1 - beta2) denom = torch._foreach_sqrt(exp_avg_sq) torch._foreach_add_(denom, eps) @@ -143,21 +149,21 @@ def step(self, closure=None): # These operations update y in-place, # without computing x explicitly. torch._foreach_lerp_(y, z, weight=ckp1) - torch._foreach_add_(y, grad, alpha=lr*(beta1*(1-ckp1)-1)) + torch._foreach_add_(y, grad, alpha=lr * (beta1 * (1 - ckp1) - 1)) # z step torch._foreach_sub_(z, grad, alpha=lr) else: for p in active_p: - y = p.data # Notation to match theory + y = p.data # Notation to match theory grad = p.grad.data state = self.state[p] - z = state['z'] - exp_avg_sq = state['exp_avg_sq'] + z = state["z"] + exp_avg_sq = state["exp_avg_sq"] - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1-beta2) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) denom = exp_avg_sq.sqrt().add_(eps) # Reuse grad buffer for memory efficiency @@ -170,10 +176,10 @@ def step(self, closure=None): # These operations update y in-place, # without computing x explicitly. 
y.lerp_(end=z, weight=ckp1) - y.add_(grad_normalized, alpha=lr*(beta1*(1-ckp1)-1)) + y.add_(grad_normalized, alpha=lr * (beta1 * (1 - ckp1) - 1)) # z step z.sub_(grad_normalized, alpha=lr) - group['k'] = k+1 - return loss \ No newline at end of file + group["k"] = k + 1 + return loss diff --git a/src/lmflow/optim/adan.py b/src/lmflow/optim/adan.py index c2ad68434..76eb1a214 100644 --- a/src/lmflow/optim/adan.py +++ b/src/lmflow/optim/adan.py @@ -1,12 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math -from typing import List + import torch from torch import Tensor from torch.optim.optimizer import Optimizer + class Adan(Optimizer): """Implements a pytorch variant of Adan. @@ -16,30 +16,29 @@ class Adan(Optimizer): """ - def __init__(self, - params, - lr=1e-3, - betas=(0.98, 0.92, 0.99), - eps=1e-8, - weight_decay=0.0, - max_grad_norm=0.0, - no_prox=False, - foreach: bool = True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.98, 0.92, 0.99), + eps=1e-8, + weight_decay=0.0, + max_grad_norm=0.0, + no_prox=False, + foreach: bool = True, + ): if not 0.0 <= max_grad_norm: - raise ValueError('Invalid Max grad norm: {}'.format(max_grad_norm)) + raise ValueError("Invalid Max grad norm: {}".format(max_grad_norm)) if not 0.0 <= lr: - raise ValueError('Invalid learning rate: {}'.format(lr)) + raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: - raise ValueError('Invalid epsilon value: {}'.format(eps)) + raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError('Invalid beta parameter at index 0: {}'.format( - betas[0])) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError('Invalid beta parameter at index 1: {}'.format( - betas[1])) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if not 0.0 <= betas[2] < 1.0: - raise ValueError('Invalid beta parameter at index 2: {}'.format( - betas[2])) + raise ValueError("Invalid beta parameter at index 2: {}".format(betas[2])) defaults = dict( lr=lr, betas=betas, @@ -47,50 +46,48 @@ def __init__(self, weight_decay=weight_decay, max_grad_norm=max_grad_norm, no_prox=no_prox, - foreach=foreach) + foreach=foreach, + ) super().__init__(params, defaults) def __setstate__(self, state): - super(Adan, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: - group.setdefault('no_prox', False) + group.setdefault("no_prox", False) @torch.no_grad() def restart_opt(self): for group in self.param_groups: - group['step'] = 0 - for p in group['params']: + group["step"] = 0 + for p in group["params"]: if p.requires_grad: state = self.state[p] # State initialization # Exponential moving average of gradient values - state['exp_avg'] = torch.zeros_like(p) + state["exp_avg"] = torch.zeros_like(p) # Exponential moving average of squared gradient values - state['exp_avg_sq'] = torch.zeros_like(p) + state["exp_avg_sq"] = torch.zeros_like(p) # Exponential moving average of gradient difference - state['exp_avg_diff'] = torch.zeros_like(p) + state["exp_avg_diff"] = torch.zeros_like(p) @torch.no_grad() def step(self): """Performs a single optimization step.""" - if self.defaults['max_grad_norm'] > 0: - device = self.param_groups[0]['params'][0].device + if self.defaults["max_grad_norm"] > 0: + device = self.param_groups[0]["params"][0].device global_grad_norm = torch.zeros(1, device=device) - max_grad_norm = torch.tensor( - self.defaults['max_grad_norm'], 
device=device) + max_grad_norm = torch.tensor(self.defaults["max_grad_norm"], device=device) for group in self.param_groups: - - for p in group['params']: + for p in group["params"]: if p.grad is not None: grad = p.grad global_grad_norm.add_(grad.pow(2).sum()) - global_grad_norm = torch.sqrt(global_grad_norm) + group['eps'] + global_grad_norm = torch.sqrt(global_grad_norm) + group["eps"] - clip_global_grad_norm = \ - torch.clamp(max_grad_norm / global_grad_norm, max=1.0) + clip_global_grad_norm = torch.clamp(max_grad_norm / global_grad_norm, max=1.0) else: clip_global_grad_norm = 1.0 @@ -102,20 +99,20 @@ def step(self): exp_avg_diffs = [] pre_grads = [] - beta1, beta2, beta3 = group['betas'] + beta1, beta2, beta3 = group["betas"] # assume same step across group now to simplify things # per parameter step can be easily support # by making it tensor, or pass list into kernel - if 'step' in group: - group['step'] += 1 + if "step" in group: + group["step"] += 1 else: - group['step'] = 1 + group["step"] = 1 - bias_correction1 = 1.0 - beta1**group['step'] - bias_correction2 = 1.0 - beta2**group['step'] - bias_correction3 = 1.0 - beta3**group['step'] + bias_correction1 = 1.0 - beta1 ** group["step"] + bias_correction2 = 1.0 - beta2 ** group["step"] + bias_correction3 = 1.0 - beta3 ** group["step"] - for p in group['params']: + for p in group["params"]: if p.grad is None: continue params_with_grad.append(p) @@ -123,20 +120,20 @@ def step(self): state = self.state[p] if len(state) == 0: - state['exp_avg'] = torch.zeros_like(p) - state['exp_avg_sq'] = torch.zeros_like(p) - state['exp_avg_diff'] = torch.zeros_like(p) + state["exp_avg"] = torch.zeros_like(p) + state["exp_avg_sq"] = torch.zeros_like(p) + state["exp_avg_diff"] = torch.zeros_like(p) - if 'pre_grad' not in state or group['step'] == 1: + if "pre_grad" not in state or group["step"] == 1: # at first step grad wouldn't be clipped # by `clip_global_grad_norm` # this is only to simplify implementation - state['pre_grad'] = p.grad + state["pre_grad"] = p.grad - exp_avgs.append(state['exp_avg']) - exp_avg_sqs.append(state['exp_avg_sq']) - exp_avg_diffs.append(state['exp_avg_diff']) - pre_grads.append(state['pre_grad']) + exp_avgs.append(state["exp_avg"]) + exp_avg_sqs.append(state["exp_avg_sq"]) + exp_avg_diffs.append(state["exp_avg_diff"]) + pre_grads.append(state["pre_grad"]) kwargs = dict( params=params_with_grad, @@ -151,28 +148,28 @@ def step(self): bias_correction1=bias_correction1, bias_correction2=bias_correction2, bias_correction3_sqrt=math.sqrt(bias_correction3), - lr=group['lr'], - weight_decay=group['weight_decay'], - eps=group['eps'], - no_prox=group['no_prox'], + lr=group["lr"], + weight_decay=group["weight_decay"], + eps=group["eps"], + no_prox=group["no_prox"], clip_global_grad_norm=clip_global_grad_norm, ) - if group['foreach']: + if group["foreach"]: copy_grads = _multi_tensor_adan(**kwargs) else: copy_grads = _single_tensor_adan(**kwargs) for p, copy_grad in zip(params_with_grad, copy_grads): - self.state[p]['pre_grad'] = copy_grad + self.state[p]["pre_grad"] = copy_grad def _single_tensor_adan( - params: List[Tensor], - grads: List[Tensor], - exp_avgs: List[Tensor], - exp_avg_sqs: List[Tensor], - exp_avg_diffs: List[Tensor], - pre_grads: List[Tensor], + params: list[Tensor], + grads: list[Tensor], + exp_avgs: list[Tensor], + exp_avg_sqs: list[Tensor], + exp_avg_diffs: list[Tensor], + pre_grads: list[Tensor], *, beta1: float, beta2: float, @@ -218,12 +215,12 @@ def _single_tensor_adan( def _multi_tensor_adan( - params: 
List[Tensor], - grads: List[Tensor], - exp_avgs: List[Tensor], - exp_avg_sqs: List[Tensor], - exp_avg_diffs: List[Tensor], - pre_grads: List[Tensor], + params: list[Tensor], + grads: list[Tensor], + exp_avgs: list[Tensor], + exp_avg_sqs: list[Tensor], + exp_avg_diffs: list[Tensor], + pre_grads: list[Tensor], *, beta1: float, beta2: float, @@ -256,8 +253,7 @@ def _multi_tensor_adan( torch._foreach_add_(exp_avg_diffs, diff, alpha=1 - beta2) # diff_t torch._foreach_mul_(exp_avg_sqs, beta3) - torch._foreach_addcmul_( - exp_avg_sqs, update, update, value=1 - beta3) # n_t + torch._foreach_addcmul_(exp_avg_sqs, update, update, value=1 - beta3) # n_t denom = torch._foreach_sqrt(exp_avg_sqs) torch._foreach_div_(denom, bias_correction3_sqrt) @@ -268,8 +264,7 @@ def _multi_tensor_adan( # beta2 * diff / bias_correction2 != diff * (beta2 / bias_correction2) # noqa # using faster version by default. uncomment for tests to pass # torch._foreach_add_(update, torch._foreach_div(torch._foreach_mul(exp_avg_diffs, beta2), bias_correction2)) # noqa - torch._foreach_add_( - update, torch._foreach_mul(exp_avg_diffs, beta2 / bias_correction2)) + torch._foreach_add_(update, torch._foreach_mul(exp_avg_diffs, beta2 / bias_correction2)) torch._foreach_div_(update, denom) if no_prox: diff --git a/src/lmflow/optim/dummy.py b/src/lmflow/optim/dummy.py index bb922199d..429733d2b 100644 --- a/src/lmflow/optim/dummy.py +++ b/src/lmflow/optim/dummy.py @@ -1,15 +1,14 @@ #!/usr/bin/env python -# coding=utf-8 -"""Dummy Optimizer. -""" -import math -import warnings -from typing import Callable, Iterable, Tuple +"""Dummy Optimizer.""" + +from collections.abc import Iterable +from typing import Callable import torch from torch import nn from torch.optim import Optimizer + class Dummy(Optimizer): """ An dummy optimizer that does nothing. @@ -24,8 +23,8 @@ class Dummy(Optimizer): def __init__( self, params: Iterable[nn.parameter.Parameter], - lr: float = 0., - betas: Tuple[float, float] = (0.9, 0.999), + lr: float = 0.0, + betas: tuple[float, float] = (0.9, 0.999), weight_decay: float = 0.0, ): if lr < 0.0: @@ -37,9 +36,8 @@ def __init__( defaults = {"lr": lr, "betas": betas, "weight_decay": weight_decay} super().__init__(params, defaults) - @torch.no_grad() - def step(self, closure: Callable=None): + def step(self, closure: Callable = None): """ Performs a single optimization step. @@ -68,9 +66,9 @@ def step(self, closure: Callable=None): # v := exp_avg # m := double_exp_avg - v, m = state["exp_avg"], state["exp_avg2"] - beta1, beta2 = group["betas"] - step_size = group["lr"] + _, m = state["exp_avg"], state["exp_avg2"] + # beta1, beta2 = group["betas"] + # step_size = group["lr"] state["step"] += 1 diff --git a/src/lmflow/optim/lamb.py b/src/lmflow/optim/lamb.py index 4d5711051..850a1f93c 100644 --- a/src/lmflow/optim/lamb.py +++ b/src/lmflow/optim/lamb.py @@ -1,10 +1,11 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math + import torch from torch.optim.optimizer import Optimizer + class Lamb(Optimizer): r"""Implements Lamb algorithm. 
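For reference, the layer-wise trust ratio that gives LAMB its name rescales each tensor's Adam-style update by ||w|| / ||update||. A simplified sketch of that scaling (an illustrative standalone helper, not code from this diff; the norm clamp mirrors the `clamp_value` argument in the hunk below):

```python
import torch

def lamb_trust_ratio(param: torch.Tensor, adam_step: torch.Tensor, clamp_value: float = 10.0) -> float:
    # ||w|| is clamped to [0, clamp_value]; a ratio of 1.0 stands in for "no scaling"
    # when either norm is zero.
    weight_norm = torch.norm(param).clamp(0, clamp_value)
    update_norm = torch.norm(adam_step)
    if weight_norm == 0 or update_norm == 0:
        return 1.0
    return (weight_norm / update_norm).item()
```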
@@ -20,7 +21,7 @@ def __init__( self, params, lr: float = 1e-3, - betas = (0.9, 0.999), + betas=(0.9, 0.999), eps: float = 1e-6, weight_decay: float = 0, clamp_value: float = 10, @@ -32,17 +33,11 @@ def __init__( if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) if clamp_value < 0.0: raise ValueError("Invalid clamp value: {}".format(clamp_value)) @@ -51,9 +46,9 @@ def __init__( self.adam = adam self.debias = debias - super(Lamb, self).__init__(params, defaults) + super().__init__(params, defaults) - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -69,10 +64,7 @@ def step(self, closure = None): continue grad = p.grad.data if grad.is_sparse: - msg = ( - "Lamb does not support sparse gradients, " - "please consider SparseAdam instead" - ) + msg = "Lamb does not support sparse gradients, please consider SparseAdam instead" raise RuntimeError(msg) state = self.state[p] @@ -81,13 +73,9 @@ def step(self, closure = None): if len(state) == 0: state["step"] = 0 # Exponential moving average of gradient values - state["exp_avg"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) + state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format) # Exponential moving average of squared gradient values - state["exp_avg_sq"] = torch.zeros_like( - p, memory_format=torch.preserve_format - ) + state["exp_avg_sq"] = torch.zeros_like(p, memory_format=torch.preserve_format) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] beta1, beta2 = group["betas"] @@ -129,4 +117,4 @@ def step(self, closure = None): p.data.add_(adam_step, alpha=-step_size * trust_ratio) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/lars.py b/src/lmflow/optim/lars.py index 557865a6c..e507f96fc 100644 --- a/src/lmflow/optim/lars.py +++ b/src/lmflow/optim/lars.py @@ -1,9 +1,9 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch from torch.optim.optimizer import Optimizer + class LARS(Optimizer): r"""Extends SGD in PyTorch with LARS scaling from the paper `Large batch training of Convolutional Networks`__. 
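Similarly, LARS derives a per-layer local learning rate from the parameter and gradient norms; the expression reformatted in the next hunk corresponds to the sketch below (a standalone helper for illustration only, not code from this diff):

```python
def lars_local_lr(p_norm: float, g_norm: float, weight_decay: float, eps: float, trust_coefficient: float) -> float:
    # Layer-wise scaling: trust_coefficient * ||w|| / (||g|| + weight_decay * ||w|| + eps).
    # When either norm is zero the optimizer leaves the update unscaled,
    # which returning 1.0 mimics here.
    if p_norm == 0.0 or g_norm == 0.0:
        return 1.0
    return trust_coefficient * p_norm / (g_norm + weight_decay * p_norm + eps)
```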
@@ -56,13 +56,9 @@ def __init__( if dampening < 0.0: raise ValueError("Invalid dampening value: {}".format(dampening)) if weight_decay < 0.0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) if trust_coefficient < 0.0: - raise ValueError( - "Invalid trust_coefficient value: {}".format(trust_coefficient) - ) + raise ValueError("Invalid trust_coefficient value: {}".format(trust_coefficient)) defaults = dict( lr=lr, @@ -74,9 +70,7 @@ def __init__( eps=eps, ) if nesterov and (momentum <= 0 or dampening != 0): - raise ValueError( - "Nesterov momentum requires a momentum and zero dampening" - ) + raise ValueError("Nesterov momentum requires a momentum and zero dampening") super().__init__(params, defaults) @@ -87,7 +81,7 @@ def __setstate__(self, state) -> None: group.setdefault("nesterov", False) @torch.no_grad() - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -116,9 +110,7 @@ def step(self, closure = None): # lars scaling + weight decay part if weight_decay != 0: if p_norm != 0 and g_norm != 0: - lars_lr = p_norm / ( - g_norm + p_norm * weight_decay + group["eps"] - ) + lars_lr = p_norm / (g_norm + p_norm * weight_decay + group["eps"]) lars_lr *= group["trust_coefficient"] d_p = d_p.add(p, alpha=weight_decay) @@ -127,9 +119,7 @@ def step(self, closure = None): if momentum != 0: param_state = self.state[p] if "momentum_buffer" not in param_state: - buf = param_state["momentum_buffer"] = torch.clone( - d_p - ).detach() + buf = param_state["momentum_buffer"] = torch.clone(d_p).detach() else: buf = param_state["momentum_buffer"] buf.mul_(momentum).add_(d_p, alpha=1 - dampening) @@ -140,4 +130,4 @@ def step(self, closure = None): p.add_(d_p, alpha=-group["lr"]) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/muon.py b/src/lmflow/optim/muon.py index 0b68f4b09..307c030b4 100644 --- a/src/lmflow/optim/muon.py +++ b/src/lmflow/optim/muon.py @@ -1,12 +1,10 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -import torch -import torch.nn as nn import math -import os -import torch.distributed as dist -import torch.nn as nn + +import torch from torch import Tensor + + def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: """ Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a @@ -17,8 +15,10 @@ def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model performance at all relative to UV^T, where USV^T = G is the SVD. 
""" - assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng - a, b, c = (3.4445, -4.7750, 2.0315) + assert ( + G.ndim >= 2 + ) # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng + a, b, c = (3.4445, -4.7750, 2.0315) X = G.bfloat16() if G.size(-2) > G.size(-1): X = X.mT @@ -28,16 +28,21 @@ def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: # Perform the NS iterations for _ in range(steps): A = X @ X.mT - B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng + B = ( + b * A + c * A @ A + ) # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng X = a * X + B @ X - + if G.size(-2) > G.size(-1): X = X.mT return X + + class Muon(torch.optim.Optimizer): """ Adam optimizer with orthogonalization step. """ + def __init__(self, params, lr=0.001, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, ns_steps=5): defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, ns_steps=ns_steps) super().__init__(params, defaults) @@ -56,7 +61,7 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad @@ -64,35 +69,35 @@ def step(self, closure=None): # Initialize state if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p) - state['exp_avg_sq'] = torch.zeros_like(p) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p) + state["exp_avg_sq"] = torch.zeros_like(p) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] - state['step'] += 1 - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] + state["step"] += 1 + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] # Update momentum and squared gradient exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) # Compute the update - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) - step_size = group['lr'] / bias_correction1 + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group["eps"]) + step_size = group["lr"] / bias_correction1 # Orthogonalize the update update = exp_avg / denom if update.ndim >= 2: - update = zeropower_via_newtonschulz5(update, steps=group['ns_steps']) + update = zeropower_via_newtonschulz5(update, steps=group["ns_steps"]) # Apply the update p.add_(update, alpha=-step_size) # Apply weight decay - if group['weight_decay'] != 0: - p.add_(p, alpha=-group['lr'] * group['weight_decay']) + if group["weight_decay"] != 0: + p.add_(p, alpha=-group["lr"] * group["weight_decay"]) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/nadam.py b/src/lmflow/optim/nadam.py index 46330c560..3be65f6e5 100644 --- a/src/lmflow/optim/nadam.py +++ b/src/lmflow/optim/nadam.py @@ -1,9 +1,10 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- -import torch import math +import torch + + class NAdam(torch.optim.Optimizer): def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, momentum_decay=4e-3): if not 0.0 <= lr: @@ -19,10 +20,10 @@ def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 if not 
0.0 <= momentum_decay: raise ValueError("Invalid momentum_decay value: {}".format(momentum_decay)) defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, momentum_decay=momentum_decay) - super(NAdam, self).__init__(params, defaults) + super().__init__(params, defaults) def __setstate__(self, state): - super(NAdam, self).__setstate__(state) + super().__setstate__(state) def step(self, closure=None): loss = None @@ -30,29 +31,29 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('NAdam does not support sparse gradients') + raise RuntimeError("NAdam does not support sparse gradients") state = self.state[p] # State initialization if len(state) == 0: - state['step'] = 0 - state['m_prev'] = torch.zeros_like(p.data) - state['v'] = torch.zeros_like(p.data) + state["step"] = 0 + state["m_prev"] = torch.zeros_like(p.data) + state["v"] = torch.zeros_like(p.data) - m_prev, v = state['m_prev'], state['v'] - beta1, beta2 = group['betas'] + m_prev, v = state["m_prev"], state["v"] + beta1, beta2 = group["betas"] - state['step'] += 1 - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] + state["step"] += 1 + bias_correction1 = 1 - beta1 ** state["step"] + bias_correction2 = 1 - beta2 ** state["step"] - if group['weight_decay'] != 0: - grad = grad.add(group['weight_decay'], p.data) + if group["weight_decay"] != 0: + grad = grad.add(group["weight_decay"], p.data) m = beta1 * m_prev + (1 - beta1) * grad v.mul_(beta2).addcmul_(1 - beta2, grad, grad) @@ -60,14 +61,14 @@ def step(self, closure=None): m_hat = m / bias_correction1 v_hat = v / bias_correction2 - denom = v_hat.sqrt().add_(group['eps']) + denom = v_hat.sqrt().add_(group["eps"]) - momentum_decay = group['momentum_decay'] + momentum_decay = group["momentum_decay"] m_prev.mul_(beta1).add_(1 - beta1, grad) m_prev_hat = m_prev / bias_correction1 - step_size = group['lr'] * math.sqrt(bias_correction2) / bias_correction1 + step_size = group["lr"] * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(-step_size, m_hat + momentum_decay * m_prev_hat, denom) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/novograd.py b/src/lmflow/optim/novograd.py index 86666e765..a8cfff280 100644 --- a/src/lmflow/optim/novograd.py +++ b/src/lmflow/optim/novograd.py @@ -1,11 +1,13 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch import torch.optim as optim + class NovoGrad(optim.Optimizer): - def __init__(self, params, lr=0.01, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, grad_averaging=False, amsgrad=False): + def __init__( + self, params, lr=0.01, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, grad_averaging=False, amsgrad=False + ): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: @@ -16,13 +18,15 @@ def __init__(self, params, lr=0.01, betas=(0.9, 0.999), eps=1e-8, weight_decay=0 raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, amsgrad=amsgrad) - super(NovoGrad, self).__init__(params, defaults) + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, grad_averaging=grad_averaging, 
amsgrad=amsgrad + ) + super().__init__(params, defaults) def __setstate__(self, state): - super(NovoGrad, self).__setstate__(state) + super().__setstate__(state) for group in self.param_groups: - group.setdefault('amsgrad', False) + group.setdefault("amsgrad", False) def step(self, closure=None): loss = None @@ -30,30 +34,30 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError('NovoGrad does not support sparse gradients') - amsgrad = group['amsgrad'] + raise RuntimeError("NovoGrad does not support sparse gradients") + amsgrad = group["amsgrad"] state = self.state[p] # State initialization if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p.data) - state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p.data) + state["exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) if amsgrad: - state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + state["max_exp_avg_sq"] = torch.zeros([]).to(state["exp_avg"].device) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] if amsgrad: - max_exp_avg_sq = state['max_exp_avg_sq'] - beta1, beta2 = group['betas'] + max_exp_avg_sq = state["max_exp_avg_sq"] + beta1, beta2 = group["betas"] - state['step'] += 1 + state["step"] += 1 norm = torch.sum(torch.pow(grad, 2)) @@ -66,19 +70,19 @@ def step(self, closure=None): # Maintains the maximum of all 2nd moment running avg. till now torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) # Use the max. for normalizing running avg. of gradient - denom = max_exp_avg_sq.sqrt().add_(group['eps']) + denom = max_exp_avg_sq.sqrt().add_(group["eps"]) else: - denom = exp_avg_sq.sqrt().add_(group['eps']) + denom = exp_avg_sq.sqrt().add_(group["eps"]) grad.div_(denom) - if group['weight_decay'] != 0: - grad.add_(group['weight_decay'], p.data) + if group["weight_decay"] != 0: + grad.add_(group["weight_decay"], p.data) - if group['grad_averaging']: + if group["grad_averaging"]: grad.mul_(1 - beta1) exp_avg.mul_(beta1).add_(grad) - p.data.add_(-group['lr'], exp_avg) + p.data.add_(-group["lr"], exp_avg) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/optimizers.py b/src/lmflow/optim/optimizers.py index 929230539..2f219bb0f 100644 --- a/src/lmflow/optim/optimizers.py +++ b/src/lmflow/optim/optimizers.py @@ -1,24 +1,2 @@ #!/usr/bin/env python -# coding=utf-8 -"""All optimizers. 
-""" -from lmflow.optim.dummy import Dummy -from lmflow.optim.adabelief import AdaBelief -from lmflow.optim.adabound import AdaBound -from lmflow.optim.lars import LARS -from lmflow.optim.lamb import Lamb -from lmflow.optim.adamax import Adamax -from lmflow.optim.nadam import NAdam -from lmflow.optim.radam import RAdam -from lmflow.optim.adamp import AdamP -from lmflow.optim.sgdp import SGDP -from lmflow.optim.yogi import Yogi -from lmflow.optim.sophia import SophiaG -from lmflow.optim.adan import Adan -from lmflow.optim.novograd import NovoGrad -from lmflow.optim.adam import Adam -from lmflow.optim.adadelta import Adadelta -from lmflow.optim.adagrad import AdaGrad -from lmflow.optim.muon import Muon -from lmflow.optim.adamw_schedule_free import AdamWScheduleFree -from lmflow.optim.sgd_schedule_free import SGDScheduleFree +"""All optimizers.""" diff --git a/src/lmflow/optim/radam.py b/src/lmflow/optim/radam.py index 21232cb18..39d2f590e 100644 --- a/src/lmflow/optim/radam.py +++ b/src/lmflow/optim/radam.py @@ -1,11 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math import warnings + import torch from torch.optim.optimizer import Optimizer + class RAdam(Optimizer): r"""Implements RAdam optimization algorithm. @@ -24,13 +25,12 @@ def __init__( self, params, lr: float = 1e-3, - betas = (0.9, 0.999), + betas=(0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0, ) -> None: warnings.warn( - "RAdam optimizer is deprecated, since it is included " - "in pytorch natively.", + "RAdam optimizer is deprecated, since it is included in pytorch natively.", DeprecationWarning, stacklevel=2, ) @@ -39,28 +39,15 @@ def __init__( if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) - - if ( - isinstance(params, (list, tuple)) - and len(params) > 0 - and isinstance(params[0], dict) - ): + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): for param in params: - if "betas" in param and ( - param["betas"][0] != betas[0] - or param["betas"][1] != betas[1] - ): + if "betas" in param and (param["betas"][0] != betas[0] or param["betas"][1] != betas[1]): param["buffer"] = [[None, None, None] for _ in range(10)] defaults = dict( @@ -70,12 +57,12 @@ def __init__( weight_decay=weight_decay, buffer=[[None, None, None] for _ in range(10)], ) - super(RAdam, self).__init__(params, defaults) + super().__init__(params, defaults) def __setstate__(self, state): - super(RAdam, self).__setstate__(state) + super().__setstate__(state) - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. 
Arguments: @@ -97,10 +84,7 @@ def step(self, closure = None): continue grad = p.grad.data.float() if grad.is_sparse: - msg = ( - "RAdam does not support sparse gradients, " - "please consider SparseAdam instead" - ) + msg = "RAdam does not support sparse gradients, please consider SparseAdam instead" raise RuntimeError(msg) p_data_fp32 = p.data.float() @@ -109,17 +93,11 @@ def step(self, closure = None): if len(state) == 0: state["step"] = 0 - state["exp_avg"] = torch.zeros_like( - p_data_fp32, memory_format=torch.preserve_format - ) - state["exp_avg_sq"] = torch.zeros_like( - p_data_fp32, memory_format=torch.preserve_format - ) + state["exp_avg"] = torch.zeros_like(p_data_fp32, memory_format=torch.preserve_format) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32, memory_format=torch.preserve_format) else: state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) - state["exp_avg_sq"] = state["exp_avg_sq"].type_as( - p_data_fp32 - ) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] @@ -134,9 +112,7 @@ def step(self, closure = None): buffered[0] = state["step"] beta2_t = beta2 ** state["step"] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state["step"] * beta2_t / ( - 1 - beta2_t - ) + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value @@ -170,4 +146,4 @@ def step(self, closure = None): p.data.copy_(p_data_fp32) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/sgd_schedule_free.py b/src/lmflow/optim/sgd_schedule_free.py index 42a7b1746..9b47d467a 100644 --- a/src/lmflow/optim/sgd_schedule_free.py +++ b/src/lmflow/optim/sgd_schedule_free.py @@ -1,6 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. -# +# # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. #!/usr/bin/env python @@ -9,10 +9,11 @@ import torch import torch.optim + class SGDScheduleFree(torch.optim.Optimizer): r""" Schedule-Free SGD - As the name suggests, no scheduler is needed with this optimizer. + As the name suggests, no scheduler is needed with this optimizer. To add warmup, rather than using a learning rate schedule you can just set the warmup_steps parameter. @@ -20,16 +21,21 @@ class SGDScheduleFree(torch.optim.Optimizer): beginning of training and evaluation respectively. The optimizer should also be placed in eval mode when saving checkpoints. 
""" - def __init__(self, - params, - lr=1.0, - momentum=0.9, - weight_decay=0, - warmup_steps=0, - r=0.0, - weight_lr_power=2, - foreach=hasattr(torch, "_foreach_mul_"), - ): + + def __init__( + self, + params, + lr=1.0, + momentum=0.9, + weight_decay=0, + warmup_steps=0, + r=0.0, + weight_lr_power=2, + foreach=None, + ): + if not foreach: + foreach = hasattr(torch, "_foreach_mul_") + if lr < 0.0: raise ValueError("Invalid learning rate: {}".format(lr)) if weight_decay < 0.0: @@ -37,42 +43,44 @@ def __init__(self, if momentum <= 0 or momentum >= 1: raise ValueError("Momentum must be between 0 and 1 exclusive: {}".format(momentum)) - defaults = dict(lr=lr, - momentum=momentum, - r=r, - k=0, - warmup_steps=warmup_steps, - train_mode=True, - weight_sum=0.0, - lr_max=-1.0, - weight_lr_power=weight_lr_power, - weight_decay=weight_decay, - foreach=foreach) + defaults = dict( + lr=lr, + momentum=momentum, + r=r, + k=0, + warmup_steps=warmup_steps, + train_mode=True, + weight_sum=0.0, + lr_max=-1.0, + weight_lr_power=weight_lr_power, + weight_decay=weight_decay, + foreach=foreach, + ) super().__init__(params, defaults) - + def eval(self): for group in self.param_groups: - train_mode = group['train_mode'] - momentum = group['momentum'] + train_mode = group["train_mode"] + momentum = group["momentum"] if train_mode: - for p in group['params']: + for p in group["params"]: state = self.state[p] - if 'z' in state: + if "z" in state: # Set p.data to x - p.data.lerp_(end=state['z'], weight=1-1/momentum) - group['train_mode'] = False + p.data.lerp_(end=state["z"], weight=1 - 1 / momentum) + group["train_mode"] = False def train(self): for group in self.param_groups: - train_mode = group['train_mode'] - momentum = group['momentum'] + train_mode = group["train_mode"] + momentum = group["momentum"] if not train_mode: - for p in group['params']: + for p in group["params"]: state = self.state[p] - if 'z' in state: + if "z" in state: # Set p.data to y - p.data.lerp_(end=state['z'], weight=1-momentum) - group['train_mode'] = True + p.data.lerp_(end=state["z"], weight=1 - momentum) + group["train_mode"] = True def step(self, closure=None): """Performs a single optimization step. 
@@ -85,45 +93,44 @@ def step(self, closure=None): loss = None if closure is not None: loss = closure() - + for group in self.param_groups: - momentum = group['momentum'] - lr = group['lr'] - weight_decay = group['weight_decay'] - k = group['k'] - warmup_steps = group['warmup_steps'] + momentum = group["momentum"] + lr = group["lr"] + weight_decay = group["weight_decay"] + k = group["k"] + warmup_steps = group["warmup_steps"] if k < warmup_steps: - sched = (k+1) / warmup_steps + sched = (k + 1) / warmup_steps else: - sched = 1.0 - lr = group['lr']*sched + sched = 1.0 + lr = group["lr"] * sched + + weight_lr_power = group["weight_lr_power"] + + r = group["r"] + lr_max = group["lr_max"] = max(lr, group["lr_max"]) - weight_lr_power = group['weight_lr_power'] - - r = group['r'] - lr_max = group['lr_max'] = max(lr, group['lr_max']) - - weight = ((k+1)**r) * (lr_max**weight_lr_power) - weight_sum = group['weight_sum'] = group['weight_sum'] + weight + weight = ((k + 1) ** r) * (lr_max**weight_lr_power) + weight_sum = group["weight_sum"] = group["weight_sum"] + weight try: - ckp1 = weight/weight_sum + ckp1 = weight / weight_sum except ZeroDivisionError: ckp1 = 0 - if not group['train_mode']: + if not group["train_mode"]: raise Exception("Not in train mode!") - active_p = [p for p in group['params'] if p.grad is not None] + active_p = [p for p in group["params"] if p.grad is not None] for p in active_p: - if 'z' not in self.state[p]: - self.state[p]['z'] = torch.clone(p.data) + if "z" not in self.state[p]: + self.state[p]["z"] = torch.clone(p.data) - if group['foreach'] and len(active_p) > 0: - y, grad, z = zip(*[(p.data, p.grad, self.state[p]['z']) - for p in active_p]) + if group["foreach"] and len(active_p) > 0: + y, grad, z = zip(*[(p.data, p.grad, self.state[p]["z"]) for p in active_p]) # Apply weight decay if weight_decay != 0: @@ -132,15 +139,15 @@ def step(self, closure=None): # These operations update y in-place, # without computing x explicitly. torch._foreach_lerp_(y, z, weight=ckp1) - torch._foreach_add_(y, grad, alpha=lr*(momentum*(1-ckp1)-1)) + torch._foreach_add_(y, grad, alpha=lr * (momentum * (1 - ckp1) - 1)) # SGD step torch._foreach_sub_(z, grad, alpha=lr) else: for p in active_p: - y = p.data # Notation to match theory + y = p.data # Notation to match theory grad = p.grad.data - z = self.state[p]['z'] + z = self.state[p]["z"] # Apply weight decay if weight_decay != 0: @@ -149,10 +156,10 @@ def step(self, closure=None): # These operations update y in-place, # without computing x explicitly. 
y.lerp_(end=z, weight=ckp1) - y.add_(grad, alpha=lr*(momentum*(1-ckp1)-1)) + y.add_(grad, alpha=lr * (momentum * (1 - ckp1) - 1)) # SGD step z.sub_(grad, alpha=lr) - group['k'] = k+1 - return loss \ No newline at end of file + group["k"] = k + 1 + return loss diff --git a/src/lmflow/optim/sgdp.py b/src/lmflow/optim/sgdp.py index feb37f99e..e1a4ee9be 100644 --- a/src/lmflow/optim/sgdp.py +++ b/src/lmflow/optim/sgdp.py @@ -1,7 +1,7 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math + import torch from torch.optim.optimizer import Optimizer @@ -38,9 +38,7 @@ def __init__( if dampening < 0.0: raise ValueError("Invalid dampening value: {}".format(dampening)) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) if delta < 0: raise ValueError("Invalid delta value: {}".format(delta)) if wd_ratio < 0: @@ -56,7 +54,7 @@ def __init__( wd_ratio=wd_ratio, nesterov=nesterov, ) - super(SGDP, self).__init__(params, defaults) + super().__init__(params, defaults) @staticmethod def _channel_view(x): @@ -84,19 +82,15 @@ def _projection(self, p, grad, perturb, delta, wd_ratio, eps): cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func) if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)): - p_n = p.data / view_func(p.data).norm(dim=1).view( - expand_size - ).add_(eps) - perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view( - expand_size - ) + p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size) wd = wd_ratio return perturb, wd return perturb, wd - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -121,9 +115,7 @@ def step(self, closure = None): # State initialization if len(state) == 0: - state["momentum"] = torch.zeros_like( - p.data, memory_format=torch.preserve_format - ) + state["momentum"] = torch.zeros_like(p.data, memory_format=torch.preserve_format) # SGD buf = state["momentum"] @@ -147,15 +139,9 @@ def step(self, closure = None): # Weight decay if weight_decay != 0: - p.data.mul_( - 1 - - group["lr"] - * group["weight_decay"] - * wd_ratio - / (1 - momentum) - ) + p.data.mul_(1 - group["lr"] * group["weight_decay"] * wd_ratio / (1 - momentum)) # Step p.data.add_(d_p, alpha=-group["lr"]) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/optim/sophia.py b/src/lmflow/optim/sophia.py index 7d8e8db34..bd5b3804c 100644 --- a/src/lmflow/optim/sophia.py +++ b/src/lmflow/optim/sophia.py @@ -1,18 +1,26 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import torch from torch.optim.optimizer import Optimizer + class SophiaG(Optimizer): """ Sophia: A Scalable Stochastic Second-order Optimizer for Language Model Pre-training. 
Code from: https://github.com/Liuhong99/Sophia/ """ - def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04, - weight_decay=1e-1, *, maximize: bool = False, - capturable: bool = False): + def __init__( + self, + params, + lr=1e-4, + betas=(0.965, 0.99), + rho=0.04, + weight_decay=1e-1, + *, + maximize: bool = False, + capturable: bool = False, + ): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= betas[0] < 1.0: @@ -23,41 +31,44 @@ def __init__(self, params, lr=1e-4, betas=(0.965, 0.99), rho = 0.04, raise ValueError("Invalid rho parameter at index 1: {}".format(rho)) if not 0.0 <= weight_decay: raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) - defaults = dict(lr=lr, betas=betas, rho=rho, - weight_decay=weight_decay, - maximize=maximize, capturable=capturable) - super(SophiaG, self).__init__(params, defaults) + defaults = dict( + lr=lr, betas=betas, rho=rho, weight_decay=weight_decay, maximize=maximize, capturable=capturable + ) + super().__init__(params, defaults) def __setstate__(self, state): super().__setstate__(state) for group in self.param_groups: - group.setdefault('maximize', False) - group.setdefault('capturable', False) + group.setdefault("maximize", False) + group.setdefault("capturable", False) state_values = list(self.state.values()) - step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]['step']) + step_is_tensor = (len(state_values) != 0) and torch.is_tensor(state_values[0]["step"]) if not step_is_tensor: for s in state_values: - s['step'] = torch.tensor(float(s['step'])) + s["step"] = torch.tensor(float(s["step"])) @torch.no_grad() def update_hessian(self): for group in self.param_groups: - beta1, beta2 = group['betas'] - for p in group['params']: + beta1, beta2 = group["betas"] + for p in group["params"]: if p.grad is None: continue state = self.state[p] if len(state) == 0: - state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ - if self.defaults['capturable'] else torch.tensor(0.) 
- state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - if 'hessian' not in state.keys(): - state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) + state["step"] = ( + torch.zeros((1,), dtype=torch.float, device=p.device) + if self.defaults["capturable"] + else torch.tensor(0.0) + ) + state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format) + state["hessian"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if "hessian" not in state.keys(): + state["hessian"] = torch.zeros_like(p, memory_format=torch.preserve_format) - state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2) + state["hessian"].mul_(beta2).addcmul_(p.grad, p.grad, value=1 - beta2) @torch.no_grad() def step(self, closure=None, bs=5120): @@ -72,38 +83,41 @@ def step(self, closure=None, bs=5120): exp_avgs = [] state_steps = [] hessian = [] - beta1, beta2 = group['betas'] + beta1, beta2 = group["betas"] - for p in group['params']: + for p in group["params"]: if p.grad is None: continue params_with_grad.append(p) - + if p.grad.is_sparse: - raise RuntimeError('SophiaG does not support sparse gradients') + raise RuntimeError("SophiaG does not support sparse gradients") grads.append(p.grad) state = self.state[p] # State initialization if len(state) == 0: - state['step'] = torch.zeros((1,), dtype=torch.float, device=p.device) \ - if self.defaults['capturable'] else torch.tensor(0.) - state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) - state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - if 'hessian' not in state.keys(): - state['hessian'] = torch.zeros_like(p, memory_format=torch.preserve_format) - - exp_avgs.append(state['exp_avg']) - state_steps.append(state['step']) - hessian.append(state['hessian']) - - if self.defaults['capturable']: + state["step"] = ( + torch.zeros((1,), dtype=torch.float, device=p.device) + if self.defaults["capturable"] + else torch.tensor(0.0) + ) + state["exp_avg"] = torch.zeros_like(p, memory_format=torch.preserve_format) + state["hessian"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + if "hessian" not in state.keys(): + state["hessian"] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state["exp_avg"]) + state_steps.append(state["step"]) + hessian.append(state["hessian"]) + + if self.defaults["capturable"]: bs = torch.ones((1,), dtype=torch.float, device=p.device) * bs # Perform the actual update step here instead of calling SophiaG again for p, grad, exp_avg, h, step in zip(params_with_grad, grads, exp_avgs, hessian, state_steps): - if group['weight_decay'] != 0: - grad = grad.add(p, alpha=group['weight_decay']) + if group["weight_decay"] != 0: + grad = grad.add(p, alpha=group["weight_decay"]) # Decay the first and second moment running average coefficient exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) @@ -111,8 +125,7 @@ def step(self, closure=None, bs=5120): # Compute the update using the hessian information update = exp_avg.div(1 - beta1 ** step.item()) - h_sqrt = h.sqrt().add_(group['rho']) - p.addcdiv_(update, h_sqrt, value=-group['lr']) + h_sqrt = h.sqrt().add_(group["rho"]) + p.addcdiv_(update, h_sqrt, value=-group["lr"]) return loss - \ No newline at end of file diff --git a/src/lmflow/optim/utils.py b/src/lmflow/optim/utils.py new file mode 100644 index 000000000..57430bdd0 --- /dev/null +++ b/src/lmflow/optim/utils.py @@ 
-0,0 +1,201 @@ +from typing import Any, Optional + +from transformers import PreTrainedModel +from transformers.utils import is_sagemaker_mp_enabled + +import lmflow.optim.optimizers as optim +from lmflow.args import OptimizerNames, TrainingArguments + + +def create_customized_optimizer(base_trainer_class, model_args): + class CustomizedOptimTrainer(base_trainer_class): + @staticmethod + def get_optimizer_cls_and_kwargs( + args: TrainingArguments, + model: Optional[PreTrainedModel] = None, + ) -> tuple[Any, Any]: + # parse args.optim_args + optim_args = {} + if args.customized_optim_args: + for mapping in args.customized_optim_args.replace(" ", "").split(","): + key, value = mapping.split("=") + optim_args[key] = value + + optimizer_kwargs = {"lr": args.learning_rate} + + if args.customized_optim == OptimizerNames.DUMMY: + optimizer_cls = optim.Dummy + dummy_kwargs = { + "betas": (args.optim_dummy_beta1, args.optim_dummy_beta2), + } + optimizer_kwargs.update(dummy_kwargs) + elif args.customized_optim == OptimizerNames.ADABELIEF: + optimizer_cls = optim.AdaBelief + adabelief_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adabelief_kwargs) + elif args.customized_optim == OptimizerNames.ADABOUND: + optimizer_cls = optim.AdaBound + adabound_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adabound_kwargs) + elif args.customized_optim == OptimizerNames.LARS: + optimizer_cls = optim.LARS + lars_kwargs = { + "momentum": (args.optim_momentum), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(lars_kwargs) + elif args.customized_optim == OptimizerNames.LAMB: + optimizer_cls = optim.Lamb + lamb_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(lamb_kwargs) + elif args.customized_optim == OptimizerNames.ADAMAX: + optimizer_cls = optim.Adamax + adamax_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adamax_kwargs) + elif args.customized_optim == OptimizerNames.NADAM: + optimizer_cls = optim.NAdam + nadam_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(nadam_kwargs) + elif args.customized_optim == OptimizerNames.RADAM: + optimizer_cls = optim.RAdam + radam_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(radam_kwargs) + elif args.customized_optim == OptimizerNames.ADAMP: + optimizer_cls = optim.AdamP + adamp_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adamp_kwargs) + elif args.customized_optim == OptimizerNames.SGDP: + optimizer_cls = optim.SGDP + sgdp_kwargs = { + "momentum": (args.optim_momentum), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(sgdp_kwargs) + elif args.customized_optim == OptimizerNames.YOGI: + optimizer_cls = optim.Yogi + yogi_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(yogi_kwargs) + elif args.customized_optim == OptimizerNames.SOPHIA: + optimizer_cls = optim.SophiaG + sophia_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + 
"weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(sophia_kwargs) + elif args.customized_optim == OptimizerNames.ADAM: + optimizer_cls = optim.Adam + adam_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + } + optimizer_kwargs.update(adam_kwargs) + elif args.customized_optim == OptimizerNames.NOVOGRAD: + optimizer_cls = optim.NovoGrad + novograd_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(novograd_kwargs) + elif args.customized_optim == OptimizerNames.ADADELTA: + optimizer_cls = optim.Adadelta + adadelta_kwargs = {} + optimizer_kwargs.update(adadelta_kwargs) + elif args.customized_optim == OptimizerNames.ADAGRAD: + optimizer_cls = optim.AdaGrad + adagrad_kwargs = {} + optimizer_kwargs.update(adagrad_kwargs) + elif args.customized_optim == OptimizerNames.ADAMW_SCHEDULE_FREE: + optimizer_cls = optim.AdamWScheduleFree + adamw_schedule_free_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adamw_schedule_free_kwargs) + elif args.customized_optim == OptimizerNames.SGD_SCHEDULE_FREE: + optimizer_cls = optim.SGDScheduleFree + sgd_schedule_free_kwargs = { + "momentum": (args.optim_momentum), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(sgd_schedule_free_kwargs) + elif args.customized_optim == OptimizerNames.ADAN: + optimizer_cls = optim.Adan + adan_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2, args.optim_beta3), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(adan_kwargs) + elif args.customized_optim == OptimizerNames.MUON: + optimizer_cls = optim.Muon + muon_kwargs = { + "betas": (args.optim_beta1, args.optim_beta2), + "weight_decay": (args.optim_weight_decay), + } + optimizer_kwargs.update(muon_kwargs) + else: + raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.customized_optim}") + return optimizer_cls, optimizer_kwargs + + def create_optimizer(self): + opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model + + if self.optimizer is None: + decay_parameters = self.get_decay_parameter_names(opt_model) + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p + for n, p in opt_model.named_parameters() + if (n not in decay_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + ] + + optimizer_cls, optimizer_kwargs = CustomizedOptimTrainer.get_optimizer_cls_and_kwargs( + self.args, opt_model + ) + + # Overwrite `params` in case it's created by + # `get_optimizer_cls_and_kwargs` e.g. for GaLore optimizer. + if "params" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("params") + + # For layer-wise dummy optimizers we overwrite + # optimizer_grouped_parameters with `optimizer_dict` to + # avoid arguments conflicts. 
+ if "optimizer_dict" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict") + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + + return CustomizedOptimTrainer diff --git a/src/lmflow/optim/yogi.py b/src/lmflow/optim/yogi.py index f184fd391..b9eaae401 100644 --- a/src/lmflow/optim/yogi.py +++ b/src/lmflow/optim/yogi.py @@ -1,11 +1,12 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- import math + import torch import torch.nn as nn from torch.optim.optimizer import Optimizer + class Yogi(Optimizer): r"""Implements Yogi Optimizer Algorithm. It has been proposed in `Adaptive methods for Nonconvex Optimization`. @@ -20,7 +21,7 @@ def __init__( self, params, lr: float = 1e-2, - betas = (0.9, 0.999), + betas=(0.9, 0.999), eps: float = 1e-3, initial_accumulator: float = 1e-6, weight_decay: float = 0, @@ -30,17 +31,11 @@ def __init__( if eps < 0.0: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError( - "Invalid beta parameter at index 0: {}".format(betas[0]) - ) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError( - "Invalid beta parameter at index 1: {}".format(betas[1]) - ) + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) if weight_decay < 0: - raise ValueError( - "Invalid weight_decay value: {}".format(weight_decay) - ) + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) defaults = dict( lr=lr, @@ -49,9 +44,9 @@ def __init__( initial_accumulator=initial_accumulator, weight_decay=weight_decay, ) - super(Yogi, self).__init__(params, defaults) + super().__init__(params, defaults) - def step(self, closure = None): + def step(self, closure=None): r"""Performs a single optimization step. Arguments: @@ -67,10 +62,7 @@ def step(self, closure = None): continue grad = p.grad.data if grad.is_sparse: - raise RuntimeError( - "Yogi does not support sparse gradients, " - "please consider SparseAdam instead" - ) + raise RuntimeError("Yogi does not support sparse gradients, please consider SparseAdam instead") state = self.state[p] @@ -83,16 +75,12 @@ def step(self, closure = None): state["step"] = 0 # Exponential moving average of gradient values state["exp_avg"] = nn.init.constant_( - torch.empty_like( - p.data, memory_format=torch.preserve_format - ), + torch.empty_like(p.data, memory_format=torch.preserve_format), group["initial_accumulator"], ) # Exponential moving average of squared gradient values state["exp_avg_sq"] = nn.init.constant_( - torch.empty_like( - p.data, memory_format=torch.preserve_format - ), + torch.empty_like(p.data, memory_format=torch.preserve_format), group["initial_accumulator"], ) @@ -117,10 +105,8 @@ def step(self, closure = None): value=-(1 - beta2), ) - denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_( - group["eps"] - ) + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group["eps"]) step_size = group["lr"] / bias_correction1 p.data.addcdiv_(exp_avg, denom, value=-step_size) - return loss \ No newline at end of file + return loss diff --git a/src/lmflow/pipeline/auto_pipeline.py b/src/lmflow/pipeline/auto_pipeline.py index 98b08ef39..cbc7d82e3 100644 --- a/src/lmflow/pipeline/auto_pipeline.py +++ b/src/lmflow/pipeline/auto_pipeline.py @@ -1,19 +1,13 @@ #!/usr/bin/env python -# coding=utf-8 -"""Return a pipeline automatically based on its name. 
-""" -from lmflow.utils.versioning import ( - is_package_version_at_least, - is_vllm_available, - is_trl_available, - is_ray_available -) +"""Return a pipeline automatically based on its name.""" from lmflow.pipeline.evaluator import Evaluator from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer -from lmflow.pipeline.rm_tuner import RewardModelTuner from lmflow.pipeline.rm_inferencer import RewardModelInferencer +from lmflow.pipeline.rm_tuner import RewardModelTuner +from lmflow.utils.versioning import is_package_version_at_least, is_ray_available, is_trl_available, is_vllm_available + PIPELINE_MAPPING = { "evaluator": Evaluator, "finetuner": Finetuner, @@ -23,62 +17,51 @@ } PIPELINE_NEEDS_EXTRAS = [] -if not is_package_version_at_least('transformers', '4.35.0'): +if not is_package_version_at_least("transformers", "4.35.0"): from lmflow.pipeline.raft_aligner import RaftAligner - PIPELINE_MAPPING['raft_aligner'] = RaftAligner + + PIPELINE_MAPPING["raft_aligner"] = RaftAligner else: - PIPELINE_NEEDS_EXTRAS.append('raft_aligner') - + PIPELINE_NEEDS_EXTRAS.append("raft_aligner") + if is_vllm_available(): from lmflow.pipeline.vllm_inferencer import VLLMInferencer - PIPELINE_MAPPING['vllm_inferencer'] = VLLMInferencer + + PIPELINE_MAPPING["vllm_inferencer"] = VLLMInferencer else: - PIPELINE_NEEDS_EXTRAS.append('vllm_inferencer') - + PIPELINE_NEEDS_EXTRAS.append("vllm_inferencer") + if is_trl_available(): from lmflow.pipeline.dpo_aligner import DPOAligner from lmflow.pipeline.dpov2_aligner import DPOv2Aligner - PIPELINE_MAPPING['dpo_aligner'] = DPOAligner - PIPELINE_MAPPING['dpov2_aligner'] = DPOv2Aligner + + PIPELINE_MAPPING["dpo_aligner"] = DPOAligner + PIPELINE_MAPPING["dpov2_aligner"] = DPOv2Aligner else: - PIPELINE_NEEDS_EXTRAS.extend(['dpo_aligner', 'dpov2_aligner']) - + PIPELINE_NEEDS_EXTRAS.extend(["dpo_aligner", "dpov2_aligner"]) + if is_vllm_available() and is_trl_available() and is_ray_available(): from lmflow.pipeline.iterative_dpo_aligner import IterativeDPOAligner - PIPELINE_MAPPING['iterative_dpo_aligner'] = IterativeDPOAligner + + PIPELINE_MAPPING["iterative_dpo_aligner"] = IterativeDPOAligner else: - PIPELINE_NEEDS_EXTRAS.append('iterative_dpo_aligner') + PIPELINE_NEEDS_EXTRAS.append("iterative_dpo_aligner") class AutoPipeline: - """ + """ The class designed to return a pipeline automatically based on its name. 
""" + @classmethod - def get_pipeline(self, - pipeline_name, - model_args, - data_args, - pipeline_args, - *args, - **kwargs - ): + def get_pipeline(self, pipeline_name, model_args, data_args, pipeline_args, *args, **kwargs): if pipeline_name not in PIPELINE_MAPPING: if pipeline_name in PIPELINE_NEEDS_EXTRAS: raise NotImplementedError( - f'Please install the necessary dependencies ' - f'to use pipeline "{pipeline_name}"' + f'Please install the necessary dependencies to use pipeline "{pipeline_name}"' ) - - raise NotImplementedError( - f'Pipeline "{pipeline_name}" is not supported' - ) - pipeline = PIPELINE_MAPPING[pipeline_name]( - model_args, - data_args, - pipeline_args, - *args, - **kwargs - ) + raise NotImplementedError(f'Pipeline "{pipeline_name}" is not supported') + + pipeline = PIPELINE_MAPPING[pipeline_name](model_args, data_args, pipeline_args, *args, **kwargs) return pipeline diff --git a/src/lmflow/pipeline/base_aligner.py b/src/lmflow/pipeline/base_aligner.py index c2a640a5d..3c1c48ffc 100644 --- a/src/lmflow/pipeline/base_aligner.py +++ b/src/lmflow/pipeline/base_aligner.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -# coding=utf-8 -""" BaseTuner: a subclass of BasePipeline. -""" +"""BaseTuner: a subclass of BasePipeline.""" + +from abc import abstractmethod from lmflow.pipeline.base_pipeline import BasePipeline class BaseAligner(BasePipeline): - """ A subclass of BasePipeline which is alignable. - """ + """A subclass of BasePipeline which is alignable.""" + def __init__(self, *args, **kwargs): pass @@ -17,5 +17,6 @@ def _check_if_alignable(self, model, dataset, reward_model): # TODO: add reward_model pass + @abstractmethod def align(self, model, dataset, reward_model): raise NotImplementedError(".align is not implemented") diff --git a/src/lmflow/pipeline/base_pipeline.py b/src/lmflow/pipeline/base_pipeline.py index e5d03b91e..cec27b1c4 100644 --- a/src/lmflow/pipeline/base_pipeline.py +++ b/src/lmflow/pipeline/base_pipeline.py @@ -1,9 +1,6 @@ #!/usr/bin/env python -# coding=utf-8 -""" BasePipeline. -""" +"""BasePipeline.""" -from abc import ABC # abstract class -class BasePipeline(ABC): +class BasePipeline: pass diff --git a/src/lmflow/pipeline/base_tuner.py b/src/lmflow/pipeline/base_tuner.py index d73340d8f..bf0ba40a0 100644 --- a/src/lmflow/pipeline/base_tuner.py +++ b/src/lmflow/pipeline/base_tuner.py @@ -1,14 +1,14 @@ #!/usr/bin/env python -# coding=utf-8 -""" BaseTuner: a subclass of BasePipeline. -""" +"""BaseTuner: a subclass of BasePipeline.""" + +from abc import abstractmethod from lmflow.pipeline.base_pipeline import BasePipeline class BaseTuner(BasePipeline): - """ A subclass of BasePipeline which is tunable. 
- """ + """A subclass of BasePipeline which is tunable.""" + def __init__(self, *args, **kwargs): pass @@ -16,5 +16,6 @@ def _check_if_tunable(self, model, dataset): # TODO: check if the model is tunable and dataset is compatible pass + @abstractmethod def tune(self, model, dataset): raise NotImplementedError(".tune is not implemented") diff --git a/src/lmflow/pipeline/dpo_aligner.py b/src/lmflow/pipeline/dpo_aligner.py index cdc0868a0..254762e46 100644 --- a/src/lmflow/pipeline/dpo_aligner.py +++ b/src/lmflow/pipeline/dpo_aligner.py @@ -1,12 +1,11 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- # @Time : 7/4/2024 21:12 # @Author : Yu Li # @Site : # @File : dpo_pipeline.py import os from pathlib import Path -from typing import Dict, Optional +from typing import Optional from datasets import Dataset, load_dataset from peft import LoraConfig @@ -22,29 +21,26 @@ def get_paired_dataset( - data_root: str, - data_dir: str, - sanity_check: bool = False, - cache_dir: Optional[str] = None, - num_proc=24, + data_root: str, + data_dir: str, + sanity_check: bool = False, + cache_dir: Optional[str] = None, + num_proc=24, ) -> Dataset: """Load dataset and convert it to the necessary format. The dataset is converted to a dictionary with the following structure: { - 'prompt': List[str], - 'chosen': List[str], - 'rejected': List[str], + 'prompt': list[str], + 'chosen': list[str], + 'rejected': list[str], } Prompts are structured as follows: "Question: " + + "\n\nAnswer: " """ data_path = Path(data_root) / data_dir - data_files = [ - x.absolute().as_posix() - for x in data_path.glob("*.json") - ] + data_files = [x.absolute().as_posix() for x in data_path.glob("*.json")] dataset = load_dataset( path=data_root, split="train", @@ -56,7 +52,7 @@ def get_paired_dataset( if sanity_check: dataset = dataset.select(range(min(len(dataset), 1000))) - def return_prompt_and_responses(samples) -> Dict[str, str]: + def return_prompt_and_responses(samples) -> dict[str, str]: return { "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]], "chosen": samples["response_j"], @@ -135,21 +131,21 @@ def _initialize_trainer(self, model, tokenizer): def _load_dataset(self): # load training set - self.train_dataset = get_paired_dataset(data_root=self.data_args.dataset_path, - data_dir="train", - sanity_check=self.aligner_args.sanity_check) + self.train_dataset = get_paired_dataset( + data_root=self.data_args.dataset_path, data_dir="train", sanity_check=self.aligner_args.sanity_check + ) self.train_dataset = self.train_dataset.filter( lambda x: len(x["prompt"]) + len(x["chosen"]) <= self.aligner_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= self.aligner_args.max_length + and len(x["prompt"]) + len(x["rejected"]) <= self.aligner_args.max_length ) # load evaluation set if self.aligner_args.eval_dataset_path: - self.eval_dataset = get_paired_dataset(data_root=self.aligner_args.eval_dataset_path, - data_dir="test", - sanity_check=True) + self.eval_dataset = get_paired_dataset( + data_root=self.aligner_args.eval_dataset_path, data_dir="test", sanity_check=True + ) self.eval_dataset = self.eval_dataset.filter( lambda x: len(x["prompt"]) + len(x["chosen"]) <= self.aligner_args.max_length - and len(x["prompt"]) + len(x["rejected"]) <= self.aligner_args.max_length + and len(x["prompt"]) + len(x["rejected"]) <= self.aligner_args.max_length ) def align(self, model, dataset, reward_model): @@ -158,7 +154,6 @@ def align(self, model, dataset, reward_model): tokenizer.pad_token_id = 
tokenizer.eos_token_id self._load_dataset() - wrapped_model = model model = model.get_backend_model() dpo_trainer = self._initialize_trainer(model, tokenizer) @@ -168,7 +163,3 @@ def align(self, model, dataset, reward_model): # 7. save output_dir = os.path.join(self.aligner_args.output_dir, "final_checkpoint") dpo_trainer.model.save_pretrained(output_dir) - - - - diff --git a/src/lmflow/pipeline/dpov2_aligner.py b/src/lmflow/pipeline/dpov2_aligner.py index 7d5104cd0..4e93b6253 100644 --- a/src/lmflow/pipeline/dpov2_aligner.py +++ b/src/lmflow/pipeline/dpov2_aligner.py @@ -4,37 +4,26 @@ import os import subprocess import sys -from typing import Optional, List, Tuple, Dict, Union +from typing import Union import numpy as np -from tqdm import tqdm import torch +from tqdm import tqdm from transformers import TrainingArguments -from lmflow.pipeline.utils.dpov2_trainer import DPOv2Trainer -from lmflow.pipeline.base_aligner import BaseAligner -from lmflow.args import ( - ModelArguments, - DatasetArguments, - DPOv2AlignerArguments -) -from lmflow.utils.common import ( - make_shell_args_from_dataclass, - add_dataclass_attr_prefix, - create_copied_dataclass -) +from lmflow.args import DatasetArguments, DPOv2AlignerArguments, ModelArguments +from lmflow.datasets.dataset import KEY_INSTANCES, KEY_SCORE, KEY_TYPE, Dataset from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.datasets.dataset import Dataset, KEY_SCORE, KEY_TYPE, KEY_INSTANCES +from lmflow.pipeline.base_aligner import BaseAligner +from lmflow.pipeline.utils.dpov2_trainer import DPOv2Trainer +from lmflow.utils.common import add_dataclass_attr_prefix, create_copied_dataclass, make_shell_args_from_dataclass from lmflow.utils.constants import MEMORY_SAFE_DPOV2_ALIGN_ENV_VAR_TO_REMOVE - logger = logging.getLogger(__name__) ReferenceModelArguments = create_copied_dataclass( - original_dataclass=ModelArguments, - field_prefix="reference_", - class_prefix="Reference" + original_dataclass=ModelArguments, field_prefix="reference_", class_prefix="Reference" ) @@ -51,29 +40,25 @@ def __init__( self.data_args = data_args self.aligner_args = aligner_args - def align( self, model: HFDecoderModel, ref_model: HFDecoderModel, train_dataset: Dataset, eval_dataset: Dataset, - transform_dataset_in_place: bool=True, + transform_dataset_in_place: bool = True, ): - if (train_dataset.get_type() not in ["text_to_scored_textlist", "paired_text_to_text"]) or \ - (eval_dataset.get_type() not in ["text_to_scored_textlist", "paired_text_to_text"]): - raise ValueError( - f"Unsupported dataset type {train_dataset.get_type()} for DPOv2 aligner." - ) - + if (train_dataset.get_type() not in ["text_to_scored_textlist", "paired_text_to_text"]) or ( + eval_dataset.get_type() not in ["text_to_scored_textlist", "paired_text_to_text"] + ): + raise ValueError(f"Unsupported dataset type {train_dataset.get_type()} for DPOv2 aligner.") + # step 0. setting up if self.aligner_args.gradient_checkpointing: - logger.warning( - "Setting backend_model.config.use_cache to False since using gradient checkpointing" - ) + logger.warning("Setting backend_model.config.use_cache to False since using gradient checkpointing") model.get_backend_model().config.use_cache = False ref_model.get_backend_model().config.use_cache = False - + # step 1. 
prepare datasets if train_dataset.get_type() == "text_to_scored_textlist": train_dataset = self.convert_to_paired_dataset( @@ -84,7 +69,9 @@ def align( use_fast=False, ) if self.data_args.max_train_samples: - train_dataset.backend_dataset = train_dataset.backend_dataset.select(range(self.data_args.max_train_samples)) + train_dataset.backend_dataset = train_dataset.backend_dataset.select( + range(self.data_args.max_train_samples) + ) if eval_dataset.get_type() == "text_to_scored_textlist": eval_dataset = self.convert_to_paired_dataset( @@ -93,12 +80,12 @@ def align( margin_scale=self.aligner_args.margin_scale, use_fast=False, ) - + # step 2. prepare trainer dpo_trainer = DPOv2Trainer( model.get_backend_model(), ref_model.get_backend_model(), - train_dataset=train_dataset.get_backend_dataset(), # tokenization is done in the trainer + train_dataset=train_dataset.get_backend_dataset(), # tokenization is done in the trainer eval_dataset=eval_dataset.get_backend_dataset(), tokenizer=model.tokenizer, args=self.__prepare_training_args(self.aligner_args), @@ -108,9 +95,10 @@ def align( max_length=self.aligner_args.max_length, mask_prompt=self.aligner_args.mask_prompt, len_penalty=self.aligner_args.length_penalty, - # preprocessing_num_workers=self.data_args.preprocessing_num_workers, # will trigger TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object ) - + # will trigger TypeError: cannot pickle 'torch._C._distributed_c10d.ProcessGroup' object + # if we use preprocessing_num_workers + # step 3. train dpo_trainer.train() dpo_trainer.save_model(self.aligner_args.output_dir) @@ -118,12 +106,11 @@ def align( # step 4. save output_dir = os.path.join(self.aligner_args.output_dir, "final_checkpoint") dpo_trainer.model.save_pretrained(output_dir) - + # step 5. release resources with torch.no_grad(): torch.cuda.empty_cache() - - + def __prepare_training_args( self, args: DPOv2AlignerArguments, @@ -147,94 +134,85 @@ def __prepare_training_args( bf16=args.bf16, report_to=args.report_to, run_name=args.run_name, - remove_unused_columns=False, # DO NOT CHANGE THIS, may cause error https://discuss.huggingface.co/t/indexerror-invalid-key-16-is-out-of-bounds-for-size-0/14298/3 + remove_unused_columns=False, # DO NOT CHANGE THIS, may cause error https://discuss.huggingface.co/t/indexerror-invalid-key-16-is-out-of-bounds-for-size-0/14298/3 ) logger.debug(f"Actual training arguments for dpo trainer: {training_args}") - + return training_args - - + def convert_to_paired_dataset( self, source_dataset: Dataset, - sampling_paired_method: str="random", - length_penalty: float=0.0, - margin_scale: float=1.0, - use_fast: bool=False, + sampling_paired_method: str = "random", + length_penalty: float = 0.0, + margin_scale: float = 1.0, + use_fast: bool = False, ) -> Dataset: - """Convert a scored one to multiple (text_to_scored_textlist) to a paired dataset by rejection sampling. 
- """ - output_dict = { - KEY_INSTANCES: [] - } + """Convert a scored one to multiple (text_to_scored_textlist) to a paired dataset by rejection sampling.""" + output_dict = {KEY_INSTANCES: []} if source_dataset.get_type() in ["text_to_scored_textlist"]: output_dict[KEY_TYPE] = "paired_text_to_text" - + for sample in tqdm(source_dataset.get_backend_dataset(), desc="Converting to paired dataset"): sample_output_dict = {} lengths = self._calc_response_lengths(sample["output"], source_dataset.get_type()) penalized_rewards = self._calc_reward_with_length_penalty( - rewards=[content[KEY_SCORE] for content in sample["output"]], - lengths=lengths, - length_penalty=length_penalty + rewards=[content[KEY_SCORE] for content in sample["output"]], + lengths=lengths, + length_penalty=length_penalty, ) chosen_idx, rejected_idx = self.sampling_paired_idx_from_rewards( - rewards=penalized_rewards, - sampling_paired_method=sampling_paired_method, - use_fast=use_fast + rewards=penalized_rewards, sampling_paired_method=sampling_paired_method, use_fast=use_fast ) - + sample_output_dict["prompt"] = sample["input"] sample_output_dict["chosen"] = sample["output"][chosen_idx]["text"] sample_output_dict["rejected"] = sample["output"][rejected_idx]["text"] - sample_output_dict["margin"] = (sample["output"][chosen_idx][KEY_SCORE] - sample["output"][rejected_idx][KEY_SCORE]) * margin_scale + sample_output_dict["margin"] = ( + sample["output"][chosen_idx][KEY_SCORE] - sample["output"][rejected_idx][KEY_SCORE] + ) * margin_scale output_dict[KEY_INSTANCES].append(sample_output_dict) - + output_dataset_args = copy.deepcopy(source_dataset.data_args) output_dataset_args.dataset_path = None output_dataset_args.dataset_name = f"paired_{output_dataset_args.dataset_name}" output_dataset = Dataset(output_dataset_args) output_dataset = output_dataset.from_dict(output_dict) - + return output_dataset - - + def _calc_response_lengths( self, - outputs: List[Union[str, Dict[str, str]]], + outputs: list[Union[str, dict[str, str]]], dataset_type: str, - ) -> List[int]: + ) -> list[int]: all_lengths = [] if dataset_type == "text_to_scored_textlist": all_lengths = [len(output["text"]) for output in outputs] - + else: - raise NotImplementedError( - f"Unknown dataset type {dataset_type} when calculating the response length." - ) - + raise NotImplementedError(f"Unknown dataset type {dataset_type} when calculating the response length.") + return all_lengths - - + def _calc_reward_with_length_penalty( self, - rewards: List[float], - lengths: List[int], + rewards: list[float], + lengths: list[int], length_penalty: float, - ) -> List[float]: - """When length_penalty > 0, penalize the longer sequence by subtracting + ) -> list[float]: + """When length_penalty > 0, penalize the longer sequence by subtracting length_penalty * length from the reward. Vice versa when length_penalty < 0. """ assert len(rewards) == len(lengths), "The number of rewards and lengths should be the same." return [reward - length_penalty * length for reward, length in zip(rewards, lengths)] - - + def sampling_paired_idx_from_rewards( self, - rewards: List[float], - sampling_paired_method: str="random", - use_fast: bool=False, - ) -> Tuple[int, int]: + rewards: list[float], + sampling_paired_method: str = "random", + use_fast: bool = False, + ) -> tuple[int, int]: """Prepare the dataset for DPO training by rejection sampling. 
We implement different strategies to select pairs, including random: randomly select two instances @@ -247,14 +225,11 @@ def sampling_paired_idx_from_rewards( else: return self._sampling_paired_idx_from_rewards(rewards, sampling_paired_method) - def _sampling_paired_idx_from_rewards( - self, - rewards: List[float], - sampling_paired_method: str="random" - ) -> Tuple[int, int]: + self, rewards: list[float], sampling_paired_method: str = "random" + ) -> tuple[int, int]: idx_0, idx_1 = -1, -1 - + if sampling_paired_method == "random": idx_0, idx_1 = np.random.choice(len(rewards), size=2, replace=False) elif sampling_paired_method == "max_min": @@ -267,19 +242,16 @@ def _sampling_paired_idx_from_rewards( idx_1 = np.random.choice([i for i in range(len(rewards)) if i != idx_0]) else: raise ValueError(f"Unknown sampling method: {sampling_paired_method}") - + chosen_idx, rejected_idx = (idx_0, idx_1) if rewards[idx_0] > rewards[idx_1] else (idx_1, idx_0) - - return chosen_idx, rejected_idx + return chosen_idx, rejected_idx def _sampling_paired_idx_from_rewards_fast( - self, - rewards: List[float], - sampling_paired_method: str="random" - ) -> Tuple[int, int]: + self, rewards: list[float], sampling_paired_method: str = "random" + ) -> tuple[int, int]: idx_0, idx_1 = -1, -1 - + if sampling_paired_method == "random": idx_0, idx_1 = 0, 1 elif sampling_paired_method == "max_min": @@ -292,11 +264,11 @@ def _sampling_paired_idx_from_rewards_fast( idx_1 = 0 if idx_0 != 0 else 1 else: raise ValueError(f"Unknown sampling method: {sampling_paired_method}") - + chosen_idx, rejected_idx = (idx_0, idx_1) if rewards[idx_0] > rewards[idx_1] else (idx_1, idx_0) - + return chosen_idx, rejected_idx - + class MemorySafeDPOv2Aligner: def __init__( @@ -307,33 +279,28 @@ def __init__( ref_model_args: ModelArguments, ): self.model_args = model_args - self.ref_model_args = ReferenceModelArguments(**add_dataclass_attr_prefix(ref_model_args, 'reference_')) + self.ref_model_args = ReferenceModelArguments(**add_dataclass_attr_prefix(ref_model_args, "reference_")) self.data_args = data_args self.aligner_args = aligner_args self.aligner_file_path = pkg_resources.files("lmflow.pipeline.utils") / "memory_safe_dpov2_align.py" def align(self): aligner_args = make_shell_args_from_dataclass( - dataclass_objects=[ - self.model_args, - self.data_args, - self.aligner_args, - self.ref_model_args - ], + dataclass_objects=[self.model_args, self.data_args, self.aligner_args, self.ref_model_args], format="shell", - ignored_args_list=['accelerator_config', 'fsdp_config', '_n_gpu'], + ignored_args_list=["accelerator_config", "fsdp_config", "_n_gpu"], ) cmd = ( f"accelerate launch --config_file {self.aligner_args.accelerate_config_file}" - + " " - + str(self.aligner_file_path) - + " " + + " " + + str(self.aligner_file_path) + + " " + aligner_args ) current_env = os.environ.copy() for var in MEMORY_SAFE_DPOV2_ALIGN_ENV_VAR_TO_REMOVE: current_env.pop(var, None) - + cli_res = subprocess.run( args=cmd, stdout=sys.stdout, @@ -343,7 +310,7 @@ def align(self): env=current_env, ) logger.info(f"MemorySafeDPOv2Aligner subprocess run finished, info at finish: {cli_res}") - + if cli_res.returncode != 0: print(cli_res.stderr) - raise RuntimeError(f"Error during MemorySafeDPOv2Aligner: {cli_res}") \ No newline at end of file + raise RuntimeError(f"Error during MemorySafeDPOv2Aligner: {cli_res}") diff --git a/src/lmflow/pipeline/evaluator.py b/src/lmflow/pipeline/evaluator.py index 6ca98fbf8..fd5a0127f 100644 --- a/src/lmflow/pipeline/evaluator.py +++ 
b/src/lmflow/pipeline/evaluator.py @@ -1,26 +1,43 @@ -"""The Evaluator class simplifies the process of running evaluation on a language model provided by a HFDecoderModel instance imported from the lmflow package. The class constructor takes three dictionaries as arguments: model_args containing arguments related to the language model, data_args containing arguments related to the data used for evaluation, and evaluator_args containing other arguments for the evaluation process. - -The class has two methods: create_dataloader() that loads the data from the test file, creates a data loader, and returns it with the size of the data, and evaluate(model) that generates output text given input text. It uses the create_dataloader() method to load the data, iterates over the data in mini-batches, and encodes the input text with the encode() method of the HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel class, decodes the generated output text using the decode() method of the HFDecoderModel class, and writes the output to a file in the output directory. The method also logs some information to the console and Weights and Biases if the use_wandb argument is True. """ +The Evaluator class simplifies the process of running evaluation on a language model provided +by a HFDecoderModel instance imported from the lmflow package. The class constructor takes three +dictionaries as arguments: model_args containing arguments related to the language model, +data_args containing arguments related to the data used for evaluation, and evaluator_args +containing other arguments for the evaluation process. + +The class has two methods: create_dataloader() that loads the data from the test file, creates +a data loader, and returns it with the size of the data, and evaluate(model) that generates +output text given input text. It uses the create_dataloader() method to load the data, iterates +over the data in mini-batches, and encodes the input text with the encode() method of the +HFDecoderModel class. Then, it generates output text using the evaluate() method of the HFDecoderModel +class, decodes the generated output text using the decode() method of the HFDecoderModel class, and +writes the output to a file in the output directory. The method also logs some information to the +console and Weights and Biases if the use_wandb argument is True. +""" + +import datetime +import json import os + +import numpy as np import torch +import torch.distributed as dist import wandb -import deepspeed -import sys -import numpy as np -import datetime -import json + # TODO: remove later from accelerate import Accelerator from transformers import AutoConfig -import torch.distributed as dist +from lmflow.args import DatasetArguments, EvaluatorArguments, ModelArguments from lmflow.datasets.dataset import Dataset from lmflow.pipeline.base_pipeline import BasePipeline -from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.utils.data_utils import set_random_seed, batchlize, answer_extraction +from lmflow.utils.data_utils import answer_extraction, batchlize, set_random_seed +from lmflow.utils.envs import is_accelerate_env +from lmflow.utils.versioning import is_deepspeed_available + os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + class Evaluator(BasePipeline): """ Initializes the `Evaluator` class with given arguments. @@ -29,7 +46,7 @@ class Evaluator(BasePipeline): ------------ model_args : ModelArguments object. 
Contains the arguments required to load the model. - + data_args : DatasetArguments object. Contains the arguments required to load the dataset. @@ -38,14 +55,20 @@ class Evaluator(BasePipeline): """ - def __init__(self, model_args, data_args, evaluator_args): - # our method + + def __init__( + self, + model_args: ModelArguments, + data_args: DatasetArguments, + evaluator_args: EvaluatorArguments, + ): + # our method self.data_args = data_args self.evaluator_args = evaluator_args self.model_args = model_args # logger - if(self.evaluator_args.use_wandb == True): + if self.evaluator_args.use_wandb: wandb.init(project="lmflow_evaluation") # random seed set_random_seed(self.evaluator_args.random_seed) @@ -53,18 +76,22 @@ def __init__(self, model_args, data_args, evaluator_args): self.world_size = int(os.getenv("WORLD_SIZE", "1")) torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error - if evaluator_args.use_accelerator_for_evaluator: + if is_accelerate_env(): self.accelerator = Accelerator() self.accelerator.wait_for_everyone() else: + if is_deepspeed_available(): + import deepspeed + else: + raise ImportError('Deepspeed is not available, please install using `pip install -e ".[deepspeed]"`') deepspeed.init_distributed() self.config = AutoConfig.from_pretrained(model_args.model_name_or_path) - try: + try: self.model_hidden_size = self.config.hidden_size - except: + except Exception: print("Error in setting hidden size, use the default size 1024") - self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config + self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config print(f"model_hidden_size = {self.model_hidden_size}") # batch size has to be divisible by world_size, but can be bigger than world_size @@ -73,29 +100,22 @@ def __init__(self, model_args, data_args, evaluator_args): self.block_size = evaluator_args.evaluate_block_size # dataloader, data_size = create_dataloader(args) # load dataset - def create_dataloader(self, dataset: Dataset): data_dict = dataset.to_dict() - inputs = [ instance["input"] for instance in data_dict["instances"] ] - outputs = [ instance["output"] for instance in data_dict["instances"] ] + inputs = [instance["input"] for instance in data_dict["instances"]] + outputs = [instance["output"] for instance in data_dict["instances"]] dataset_size = len(outputs) dataset_buf = [] for idx in range(dataset_size): - dataset_buf.append({ - "input": inputs[idx], - "output": outputs[idx], - "input_idx": idx - }) - - dataloader = batchlize( - dataset_buf, - self.evaluator_args.minibatch_size, - self.evaluator_args.random_shuffle + dataset_buf.append({"input": inputs[idx], "output": outputs[idx], "input_idx": idx}) + + dataloader = batchlize(dataset_buf, self.evaluator_args.minibatch_size, self.evaluator_args.random_shuffle) + print( + f"Successfully create dataloader with size {len(dataloader)}, " + f"batch_size {self.evaluator_args.minibatch_size}." ) - print(f"Successfully create dataloader with size {len(dataloader)},batch_size {self.evaluator_args.minibatch_size}.") - - return dataloader, dataset_size + return dataloader, dataset_size # TODO: Split for better unittest @@ -114,12 +134,11 @@ def _match(self, predicted_answer, groundtruth, answer_type=None): return predicted_answer == groundtruth return False - def evaluate( self, model, dataset: Dataset, - metric = "accuracy", + metric="accuracy", verbose=True, ): """ @@ -131,12 +150,12 @@ def evaluate( TunableModel to perform inference dataset : Dataset object. 
- + """ if metric in ["acc", "accuracy"]: - if self.evaluator_args.use_accelerator_for_evaluator: - acc = self._evaluate_acc_with_accelerator(model, dataset, verbose=verbose) + if is_accelerate_env(): + acc = self._evaluate_acc_with_accelerate(model, dataset, verbose=verbose) else: acc = self._evaluate_acc_with_deepspeed(model, dataset, verbose=verbose) print(f"Evaluating final accuracy: {acc}") @@ -152,48 +171,63 @@ def evaluate( else: raise NotImplementedError(f"metric {metric} is not supported") - - def _evaluate_acc_with_accelerator(self, model, dataset, verbose=True): + def _evaluate_acc_with_accelerate(self, model, dataset, verbose=True): dataloader, data_size = self.create_dataloader(dataset) if self.accelerator.is_local_main_process: if not os.path.exists(self.evaluator_args.output_dir): os.makedirs(self.evaluator_args.output_dir) output_writer = open(f"{self.evaluator_args.output_dir}/evaluation.json", "w") - + correct_number_list = [] for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: + if batch_index * self.world_size >= self.data_args.max_eval_samples: break - if self.local_rank*self.evaluator_args.inference_batch_size_per_device >= len(batch): - current_batch = batch[:self.evaluator_args.inference_batch_size_per_device] + if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): + current_batch = batch[: self.evaluator_args.inference_batch_size_per_device] else: - current_batch = batch[self.local_rank*self.evaluator_args.inference_batch_size_per_device:(self.local_rank+1)*self.evaluator_args.inference_batch_size_per_device] + current_batch = batch[ + self.local_rank * self.evaluator_args.inference_batch_size_per_device : (self.local_rank + 1) + * self.evaluator_args.inference_batch_size_per_device + ] prompt_structure = self.evaluator_args.prompt_structure - input = [prompt_structure.format(input=i['input']) for i in current_batch] - output = [i['output'] for i in current_batch] + input = [prompt_structure.format(input=i["input"]) for i in current_batch] + output = [i["output"] for i in current_batch] - batch_input = model.encode(input, return_tensors="pt",padding=True).to(self.accelerator.device) - inputs = batch_input['input_ids'] - mask = batch_input['attention_mask'] + batch_input = model.encode(input, return_tensors="pt", padding=True).to(self.accelerator.device) + inputs = batch_input["input_ids"] + mask = batch_input["attention_mask"] with self.accelerator.autocast(): - outputs = model.inference(inputs, max_new_tokens=self.evaluator_args.max_new_tokens,attention_mask=mask,temperature=self.evaluator_args.temperature, repetition_penalty=self.evaluator_args.repetition_penalty,use_accelerator=self.evaluator_args.use_accelerator_for_evaluator) + outputs = model.inference( + inputs, + max_new_tokens=self.evaluator_args.max_new_tokens, + attention_mask=mask, + temperature=self.evaluator_args.temperature, + repetition_penalty=self.evaluator_args.repetition_penalty, + ) text_out = model.decode(outputs, skip_special_tokens=True) - decoded_input = model.decode(inputs, skip_special_tokens=True,) + decoded_input = model.decode( + inputs, + skip_special_tokens=True, + ) prompt_length = [len(i) for i in decoded_input] - text_out = [text_out[i][prompt_length[i]:] for i in range(len(text_out))] + text_out = [text_out[i][prompt_length[i] :] for i in range(len(text_out))] answer_type = self.evaluator_args.answer_type pred_answer = [] for i in text_out: - pred_answer.append(answer_extraction( 
- i, - answer_type=answer_type, - )) + pred_answer.append( + answer_extraction( + i, + answer_type=answer_type, + ) + ) if verbose: - print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print( + f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n" + ) print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): + if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): correct_ = 0 else: correct_ = 0 @@ -208,31 +242,40 @@ def _evaluate_acc_with_accelerator(self, model, dataset, verbose=True): correct_number_list.append(correct_) # collect predictions from all gpus - output_dict = {"question": input, - "prediction": text_out, - "pred_answer": pred_answer, - "answer": output} - if(self.world_size > 1): + output_dict = {"question": input, "prediction": text_out, "pred_answer": pred_answer, "answer": output} + if self.world_size > 1: all_process_list = [{}] * self.world_size dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) else: all_process_list = [output_dict] - - if self.accelerator.is_local_main_process: - current_total = (batch_index+1) * self.world_size * self.evaluator_args.inference_batch_size_per_device - current_accuracy = np.sum(correct_number_list) / current_total if int(current_total) < data_size else np.sum(correct_number_list) / data_size - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), f"{int(current_total) if int(current_total) < data_size else data_size} / {data_size} has been finished, # correct = { np.sum(correct_number_list)}, current accuracy = {current_accuracy}") - if(self.evaluator_args.use_wandb == True): + if self.accelerator.is_local_main_process: + current_total = ( + (batch_index + 1) * self.world_size * self.evaluator_args.inference_batch_size_per_device + ) + current_accuracy = ( + np.sum(correct_number_list) / current_total + if int(current_total) < data_size + else np.sum(correct_number_list) / data_size + ) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + f"{int(current_total) if int(current_total) < data_size else data_size} / {data_size} finished, " + f"# correct = {np.sum(correct_number_list)}, current accuracy = {current_accuracy}", + ) + + if self.evaluator_args.use_wandb: wandb.log({"Accuracy": current_accuracy}) for index, output in enumerate(all_process_list): output_json = json.dumps(output) - output_writer.write(output_json + '\n') + output_writer.write(output_json + "\n") if self.accelerator.is_local_main_process: current_accuracy = np.sum(correct_number_list) / data_size - print(f"# Correct = {np.sum(correct_number_list)}, # Total = {data_size}, Final accuracy = ", current_accuracy) + print( + f"# Correct = {np.sum(correct_number_list)}, # Total = {data_size}, Final accuracy = ", current_accuracy + ) output_writer.close() return np.sum(correct_number_list) / data_size @@ -245,38 +288,53 @@ def _evaluate_acc_with_deepspeed(self, model, dataset, verbose=True): correct_number_list = [] for batch_index, batch in enumerate(dataloader): - if batch_index * self.world_size >= self.data_args.max_eval_samples: + if batch_index * self.world_size >= self.data_args.max_eval_samples: break - if self.local_rank*self.evaluator_args.inference_batch_size_per_device >= len(batch): - current_batch = 
batch[:self.evaluator_args.inference_batch_size_per_device] + if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): + current_batch = batch[: self.evaluator_args.inference_batch_size_per_device] else: - current_batch = batch[self.local_rank*self.evaluator_args.inference_batch_size_per_device:(self.local_rank+1)*self.evaluator_args.inference_batch_size_per_device] + current_batch = batch[ + self.local_rank * self.evaluator_args.inference_batch_size_per_device : (self.local_rank + 1) + * self.evaluator_args.inference_batch_size_per_device + ] prompt_structure = self.evaluator_args.prompt_structure - input = [prompt_structure.format(input=i['input']) for i in current_batch] - output = [i['output'] for i in current_batch] - input_idx = [i['input_idx'] for i in current_batch] - batch_input = model.encode(input, return_tensors="pt",padding=True).to(device=self.local_rank) - inputs = batch_input['input_ids'] - mask = batch_input['attention_mask'] - outputs = model.inference(inputs, max_new_tokens=self.evaluator_args.max_new_tokens, attention_mask=mask,temperature=self.evaluator_args.temperature, repetition_penalty=self.evaluator_args.repetition_penalty) + input = [prompt_structure.format(input=i["input"]) for i in current_batch] + output = [i["output"] for i in current_batch] + batch_input = model.encode(input, return_tensors="pt", padding=True).to(device=self.local_rank) + inputs = batch_input["input_ids"] + mask = batch_input["attention_mask"] + outputs = model.inference( + inputs, + max_new_tokens=self.evaluator_args.max_new_tokens, + attention_mask=mask, + temperature=self.evaluator_args.temperature, + repetition_penalty=self.evaluator_args.repetition_penalty, + ) text_out = model.decode(outputs, skip_special_tokens=True) # # only return the generation, trucating the input - decoded_input = model.decode(inputs, skip_special_tokens=True,) + decoded_input = model.decode( + inputs, + skip_special_tokens=True, + ) prompt_length = [len(i) for i in decoded_input] - text_out = [text_out[i][prompt_length[i]:] for i in range(len(text_out))] + text_out = [text_out[i][prompt_length[i] :] for i in range(len(text_out))] answer_type = self.evaluator_args.answer_type pred_answer = [] for i in text_out: - pred_answer.append(answer_extraction( - i, - answer_type=answer_type, - )) + pred_answer.append( + answer_extraction( + i, + answer_type=answer_type, + ) + ) if verbose: - print(f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n") + print( + f"batch_index{batch_index} rank{self.local_rank}:\n question={input}\n prediction={text_out}\n" + ) print(f"predicted answer: {pred_answer} \n") print(f"groundtruth answer: {output} \n") - if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): + if self.local_rank * self.evaluator_args.inference_batch_size_per_device >= len(batch): correct_ = 0 else: correct_ = 0 @@ -291,46 +349,57 @@ def _evaluate_acc_with_deepspeed(self, model, dataset, verbose=True): correct_number_list.append(correct_) # collect predictions from all gpus - output_dict = {"question": input, - "prediction": text_out, - "pred_answer": pred_answer, - "answer": output} + output_dict = {"question": input, "prediction": text_out, "pred_answer": pred_answer, "answer": output} all_process_list = [{}] * self.world_size dist.gather_object(output_dict, all_process_list if dist.get_rank() == 0 else None, dst=0) if not dist.is_initialized() or dist.get_rank() == 0: - current_total = (batch_index+1) * 
self.world_size * self.evaluator_args.inference_batch_size_per_device - current_accuracy = np.sum(correct_number_list) / current_total if int(current_total) < data_size else np.sum(correct_number_list) / data_size - print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), f"{int(current_total) if int(current_total) < data_size else data_size} / {data_size} has been finished, # correct = { np.sum(correct_number_list)}, current accuracy = {current_accuracy}") - - if(self.evaluator_args.use_wandb == True): + current_total = ( + (batch_index + 1) * self.world_size * self.evaluator_args.inference_batch_size_per_device + ) + current_accuracy = ( + np.sum(correct_number_list) / current_total + if int(current_total) < data_size + else np.sum(correct_number_list) / data_size + ) + print( + datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + f"{int(current_total) if int(current_total) < data_size else data_size} / {data_size} finished, " + f"# correct = {np.sum(correct_number_list)}, current accuracy = {current_accuracy}", + ) + + if self.evaluator_args.use_wandb: wandb.log({"Accuracy": current_accuracy}) for index, output in enumerate(all_process_list): output_json = json.dumps(output) - output_writer.write(output_json + '\n') + output_writer.write(output_json + "\n") if not dist.is_initialized() or dist.get_rank() == 0: current_accuracy = np.sum(correct_number_list) / data_size - print(f"# Correct = {np.sum(correct_number_list)}, # Total = {data_size}, Final accuracy = ", current_accuracy) + print( + f"# Correct = {np.sum(correct_number_list)}, # Total = {data_size}, Final accuracy = ", current_accuracy + ) output_writer.close() return np.sum(correct_number_list) / data_size def _evaluate_ppl(self, model, dataset: Dataset, verbose=True): data_dict = dataset.to_dict() - if data_dict['type'] == 'text2text': - raise NotImplementedError("ppl evaluation is currently not supported for text2text dataset, please use text_only dataset.") - texts = [ instance["text"] for instance in data_dict["instances"] ] + if data_dict["type"] == "text2text": + raise NotImplementedError( + "ppl evaluation is currently not supported for text2text dataset, please use text_only dataset." 
+ ) + texts = [instance["text"] for instance in data_dict["instances"]] encodings = model.get_tokenizer()("\n\n".join(texts), return_tensors="pt") # Define some constant if self.model_args.truncate_to_model_max_length: try: max_length = min(model.get_backend_model().config.n_positions, model.get_max_length()) - except: + except Exception: max_length = min(1024, model.get_max_length()) else: max_length = self.block_size - + if verbose: print(f"The maximum sequence length : {max_length}") seq_len = encodings.input_ids.size(1) @@ -354,13 +423,15 @@ def _evaluate_ppl(self, model, dataset: Dataset, verbose=True): nlls.append(neg_log_likelihood) prev_end_loc = end_loc if verbose: - print(f"Evaluating PPL: {int(begin_loc/self.block_size) + 1} / {int(seq_len/self.block_size)} Complete, current ppl : {torch.exp(torch.stack(nlls).mean())}") + print( + f"Evaluating PPL: {int(begin_loc / self.block_size) + 1} / {int(seq_len / self.block_size)} " + f"complete, current ppl : {torch.exp(torch.stack(nlls).mean())}" + ) if end_loc == seq_len: break ppl = torch.exp(torch.stack(nlls).mean()) return ppl - def _evaluate_nll( self, model, @@ -386,10 +457,7 @@ def _evaluate_nll( if dataset.get_type() == "text2text": prompt = self.evaluator_args.prompt_structure data_dict["instances"] = [ - { - "input": prompt.format(input=instance["input"]), - "output": instance["output"] - } + {"input": prompt.format(input=instance["input"]), "output": instance["output"]} for instance in data_dict["instances"] ] @@ -401,15 +469,13 @@ def _evaluate_nll( "input_ids": torch.tensor([input_ids]), "labels": torch.tensor([labels]), } - for input_ids, labels in zip(tokenized_dataset["input_ids"], - tokenized_dataset["labels"]) + for input_ids, labels in zip(tokenized_dataset["input_ids"], tokenized_dataset["labels"]) ] # Gets context window length try: - max_length = min(model.get_backend_model().config.n_positions, - model.get_max_length()) - except: + max_length = min(model.get_backend_model().config.n_positions, model.get_max_length()) + except Exception: max_length = min(1024, model.get_max_length()) nlls = [] @@ -431,7 +497,7 @@ def _evaluate_nll( target_ids = labels.clone() full_target_ids = input_ids.clone() - def get_nll(label_ids, nll_list): + def get_nll(label_ids, nll_list, input_ids, trg_len): label_ids[:, :-trg_len] = -100 label_ids = label_ids.to(device=self.local_rank) @@ -442,23 +508,19 @@ def get_nll(label_ids, nll_list): if not torch.all(label_ids == -100): with torch.no_grad(): - outputs = model.get_backend_model()( - input_ids, labels=label_ids - ) + outputs = model.get_backend_model()(input_ids, labels=label_ids) # loss is calculated using CrossEntropyLoss which # sums over valid labels N.B. the model only # calculates loss over trg_len - 1 labels, because # it internally shifts the labels to the left by 1. 
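The get_nll helper here relies on a detail of Hugging Face causal-LM losses: when labels are passed, outputs.loss is the mean cross-entropy over the label positions that are not -100, and the model shifts the labels left by one internally, so one fewer position is scored than the raw label count suggests. That is why the statement just below multiplies outputs.loss by the number of valid labels to recover a summed negative log-likelihood. A minimal standalone sketch of that bookkeeping, assuming a generic causal LM (the gpt2 checkpoint and the example sentence are placeholders, not LMFlow code):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")

enc = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt")
input_ids = enc["input_ids"]
labels = input_ids.clone()
labels[:, :2] = -100  # pretend the first two tokens are prompt/context, not targets

with torch.no_grad():
    out = model(input_ids, labels=labels)

# The model scores logits[:, :-1] against labels[:, 1:], so the positions that
# actually contribute to the mean loss are the non-(-100) entries of labels[:, 1:].
num_scored = (labels[:, 1:] != -100).sum().item()
summed_nll = out.loss * num_scored  # mean cross-entropy * number of scored positions
print(float(summed_nll))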
neg_log_likelihood = outputs.loss * num_valid_labels else: - neg_log_likelihood = torch.zeros([]).to( - device=self.local_rank - ) + neg_log_likelihood = torch.zeros([]).to(device=self.local_rank) nll_list.append(neg_log_likelihood) - get_nll(target_ids, nlls) - get_nll(full_target_ids, full_nlls) + get_nll(target_ids, nlls, input_ids, trg_len) + get_nll(full_target_ids, full_nlls, input_ids, trg_len) current_output_nll = torch.stack(nlls).sum() / (sample_idx + 1) current_full_nll = torch.stack(full_nlls).sum() / (sample_idx + 1) @@ -481,10 +543,7 @@ def get_nll(label_ids, nll_list): f" {current_output_nll}" ) else: - raise NotImplementedError( - "f{dataset.get_type()} typed datasets are not" - " supported" - ) + raise NotImplementedError(f"{dataset.get_type()} typed datasets are not supported") if end_loc == seq_len: break diff --git a/src/lmflow/pipeline/finetuner.py b/src/lmflow/pipeline/finetuner.py index 059cd6130..6be4611c1 100644 --- a/src/lmflow/pipeline/finetuner.py +++ b/src/lmflow/pipeline/finetuner.py @@ -1,46 +1,36 @@ #!/usr/bin/env python -# coding=utf-8 -"""The Finetuner class simplifies the process of running finetuning process on a language model for a TunableModel instance with given dataset. -""" - import copy import logging import os import sys -from typing import Any, Iterable, Optional, Tuple, Union +from copy import deepcopy +from itertools import chain +from typing import Union import datasets -import transformers import evaluate -from itertools import chain +import torch +import transformers from transformers import ( Trainer, default_data_collator, set_seed, ) -from copy import deepcopy -from transformers import PreTrainedModel, TrainingArguments from transformers.trainer_utils import get_last_checkpoint -from transformers.trainer_callback import ( - TrainerCallback, - TrainerControl, - TrainerState, -) from transformers.utils import ( - is_sagemaker_mp_enabled, send_example_telemetry, ) -import numpy as np -import lmflow.optim.optimizers as optim -from lmflow.args import OptimizerNames, DatasetArguments, ModelArguments, FinetunerArguments +from lmflow.args import DatasetArguments, FinetunerArguments, ModelArguments from lmflow.datasets.dataset import Dataset from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel + +# from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel from lmflow.models.hf_text_regression_model import HFTextRegressionModel +from lmflow.optim import create_customized_optimizer from lmflow.pipeline.base_tuner import BaseTuner -from lmflow.pipeline.utils.peft_trainer import PeftTrainer, PeftSavingCallback - +from lmflow.pipeline.utils.lisa_trainer import DynamicLayerActivationCallback +from lmflow.utils.versioning import is_package_version_at_least logger = logging.getLogger(__name__) @@ -67,15 +57,15 @@ class Finetuner(BaseTuner): Keyword arguments. """ + def __init__( - self, - model_args: ModelArguments, - data_args: DatasetArguments, - finetuner_args: FinetunerArguments, - *args, - **kwargs + self, + model_args: ModelArguments, + data_args: DatasetArguments, + finetuner_args: FinetunerArguments, + *args, + **kwargs, ): - self.model_args = model_args self.data_args = data_args self.finetuner_args = finetuner_args @@ -111,7 +101,11 @@ def __init__( # Detecting last checkpoint.
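The checkpoint-detection block that follows is the standard Trainer resume pattern: look for the newest checkpoint-* sub-directory in the output directory, and refuse to run in a non-empty directory that holds no recognizable checkpoint unless overwriting is explicitly requested. A small sketch of that pattern with a hypothetical output path (the variable names here are illustrative, not LMFlow's actual arguments):

import os

from transformers.trainer_utils import get_last_checkpoint

output_dir = "output_models/finetune"  # hypothetical output directory
overwrite_output_dir = False

last_checkpoint = None
if os.path.isdir(output_dir) and not overwrite_output_dir:
    last_checkpoint = get_last_checkpoint(output_dir)  # newest checkpoint-* dir, or None
    if last_checkpoint is None and len(os.listdir(output_dir)) > 0:
        # Non-empty directory without a checkpoint: stop rather than silently overwrite.
        raise ValueError(f"{output_dir} exists and is not empty; use overwrite_output_dir to start from scratch.")
print(f"Resuming from: {last_checkpoint}")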
last_checkpoint = None - if os.path.isdir(finetuner_args.output_dir) and finetuner_args.do_train and not finetuner_args.overwrite_output_dir: + if ( + os.path.isdir(finetuner_args.output_dir) + and finetuner_args.do_train + and not finetuner_args.overwrite_output_dir + ): last_checkpoint = get_last_checkpoint(finetuner_args.output_dir) if last_checkpoint is None and len(os.listdir(finetuner_args.output_dir)) > 0: raise ValueError( @@ -131,7 +125,6 @@ def __init__( # Set seed before initializing model. set_seed(finetuner_args.seed) - def group_text(self, tokenized_datasets, model_max_length): """ Groups texts together to form blocks of maximum length `model_max_length` and returns the processed data as @@ -170,10 +163,12 @@ def group_text(self, tokenized_datasets, model_max_length): f"The block_size passed ({data_args.block_size}) is larger" f"than the maximum length for the model" f"({model_max_length})." - f"Using block_size={data_args.block_size}.") + f"Using block_size={data_args.block_size}." + ) block_size = data_args.block_size else: block_size = data_args.block_size + # Main data processing function that will concatenate all texts from # our dataset and generate chunks of block_size. def group_texts(examples): @@ -221,217 +216,13 @@ def group_texts(examples): return lm_datasets - def create_customized_optimizer(self, base_trainer_class, model_args): - class CustomizedOptimTrainer(base_trainer_class): - - @staticmethod - def get_optimizer_cls_and_kwargs( - args: TrainingArguments, - model: Optional[PreTrainedModel] = None, - ) -> Tuple[Any, Any]: - # parse args.optim_args - optim_args = {} - if args.customized_optim_args: - for mapping in args.customized_optim_args.replace(" ", "").split(","): - key, value = mapping.split("=") - optim_args[key] = value - - optimizer_kwargs = {"lr": args.learning_rate} - - if args.customized_optim == OptimizerNames.DUMMY: - optimizer_cls = optim.Dummy - dummy_kwargs = { - "betas": (args.optim_dummy_beta1, args.optim_dummy_beta2), - } - optimizer_kwargs.update(dummy_kwargs) - elif args.customized_optim == OptimizerNames.ADABELIEF: - optimizer_cls = optim.AdaBelief - adabelief_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay) - } - optimizer_kwargs.update(adabelief_kwargs) - elif args.customized_optim == OptimizerNames.ADABOUND: - optimizer_cls = optim.AdaBound - adabound_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay) - } - optimizer_kwargs.update(adabound_kwargs) - elif args.customized_optim == OptimizerNames.LARS: - optimizer_cls = optim.LARS - lars_kwargs = { - "momentum": (args.optim_momentum), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(lars_kwargs) - elif args.customized_optim == OptimizerNames.LAMB: - optimizer_cls = optim.Lamb - lamb_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(lamb_kwargs) - elif args.customized_optim == OptimizerNames.ADAMAX: - optimizer_cls = optim.Adamax - adamax_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(adamax_kwargs) - elif args.customized_optim == OptimizerNames.NADAM: - optimizer_cls = optim.NAdam - nadam_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(nadam_kwargs) - elif args.customized_optim == 
OptimizerNames.RADAM: - optimizer_cls = optim.RAdam - radam_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(radam_kwargs) - elif args.customized_optim == OptimizerNames.ADAMP: - optimizer_cls = optim.AdamP - adamp_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(adamp_kwargs) - elif args.customized_optim == OptimizerNames.SGDP: - optimizer_cls = optim.SGDP - sgdp_kwargs = { - "momentum": (args.optim_momentum), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(sgdp_kwargs) - elif args.customized_optim == OptimizerNames.YOGI: - optimizer_cls = optim.Yogi - yogi_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(yogi_kwargs) - elif args.customized_optim == OptimizerNames.SOPHIA: - optimizer_cls = optim.SophiaG - sophia_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(sophia_kwargs) - elif args.customized_optim == OptimizerNames.ADAM: - optimizer_cls = optim.Adam - adam_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - } - optimizer_kwargs.update(adam_kwargs) - elif args.customized_optim == OptimizerNames.NOVOGRAD: - optimizer_cls = optim.NovoGrad - novograd_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(novograd_kwargs) - elif args.customized_optim == OptimizerNames.ADADELTA: - optimizer_cls = optim.Adadelta - adadelta_kwargs = { - } - optimizer_kwargs.update(adadelta_kwargs) - elif args.customized_optim == OptimizerNames.ADAGRAD: - optimizer_cls = optim.AdaGrad - adagrad_kwargs = { - } - optimizer_kwargs.update(adagrad_kwargs) - elif args.customized_optim == OptimizerNames.MUON: - optimizer_cls = optim.Muon - muon_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(muon_kwargs) - elif args.customized_optim == OptimizerNames.ADAMW_SCHEDULE_FREE: - optimizer_cls = optim.AdamWScheduleFree - adamw_schedule_free_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(adamw_schedule_free_kwargs) - elif args.customized_optim == OptimizerNames.SGD_SCHEDULE_FREE: - optimizer_cls = optim.SGDScheduleFree - sgd_schedule_free_kwargs = { - "momentum": (args.optim_momentum), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(sgd_schedule_free_kwargs) - elif args.customized_optim == OptimizerNames.ADAN: - optimizer_cls = optim.Adan - adan_kwargs = { - "betas": (args.optim_beta1, args.optim_beta2, args.optim_beta3), - "weight_decay": (args.optim_weight_decay), - } - optimizer_kwargs.update(adan_kwargs) - else: - raise ValueError( - f"Trainer cannot instantiate unsupported optimizer: " - f" {args.customized_optim}" - ) - return optimizer_cls, optimizer_kwargs - - def create_optimizer(self): - opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model - - if self.optimizer is None: - decay_parameters = self.get_decay_parameter_names(opt_model) - optimizer_grouped_parameters = [ - { - "params": [ - p for n, p in opt_model.named_parameters() - if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": self.args.weight_decay, - }, - { - 
"params": [ - p for n, p in opt_model.named_parameters() - if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer_cls, optimizer_kwargs = CustomizedOptimTrainer.get_optimizer_cls_and_kwargs(self.args, opt_model) - - # Overwrite `params` in case it's created by - # `get_optimizer_cls_and_kwargs` e.g. for GaLore optimizer. - if "params" in optimizer_kwargs: - optimizer_grouped_parameters = optimizer_kwargs.pop( - "params" - ) - - # For layer-wise dummy optimizers we overwrite - # optimizer_grouped_parameters with `optimizer_dict` to - # avoid arguments conflicts. - if "optimizer_dict" in optimizer_kwargs: - optimizer_grouped_parameters = optimizer_kwargs.pop( - "optimizer_dict" - ) - - self.optimizer = optimizer_cls( - optimizer_grouped_parameters, - **optimizer_kwargs - ) - if is_sagemaker_mp_enabled(): - self.optimizer = smp.DistributedOptimizer(self.optimizer) - - return CustomizedOptimTrainer - - def tune(self, - model: Union[HFDecoderModel, HFTextRegressionModel, HFEncoderDecoderModel], - dataset: Dataset, - transform_dataset_in_place=True, - data_collator=None): + def tune( + self, + model: Union[HFDecoderModel, HFTextRegressionModel], + dataset: Dataset, + transform_dataset_in_place=True, + data_collator=None, + ): """ Perform tuning for a model @@ -452,8 +243,7 @@ def tune(self, # Tokenization and text grouping must be done in the main process if dataset.backend == "custom_multi_modal": - dataset.backend_dataset.register_tokenizer( - model.tokenizer, model.image_processor) + dataset.backend_dataset.register_tokenizer(model.tokenizer, model.image_processor) lm_dataset = dataset else: with finetuner_args.main_process_first(desc="dataset map tokenization"): @@ -467,7 +257,29 @@ def tune(self, ) train_dataset = lm_dataset.get_backend_dataset() - logger.info(f"Number of train samples: {len(train_dataset)}") + + if data_args.calculate_dataset_stats: + total_tokens = 0 + total_target_tokens = 0 + pad_token_id = model.get_tokenizer().pad_token_id + logger.warning("Calculating dataset stats...") + import time + + start_time = time.time() + for datapoint in train_dataset: + total_tokens += len([label for label in datapoint["input_ids"] if label != pad_token_id]) + total_target_tokens += len([label for label in datapoint["labels"] if label != -100]) + logger.warning( + "Dataset stats:\n\n" + f"Total tokens: {total_tokens}\n" + f"Total target tokens: {total_target_tokens}\n" + f"Total samples: {len(train_dataset)}\n" + f"Average tokens per sample: {total_tokens / len(train_dataset)}\n" + f"Average target tokens per sample: {total_target_tokens / len(train_dataset)}\n" + ) + logger.warning("Calculating data stats took %s seconds", time.time() - start_time) + else: + logger.warning(f"Number of train samples: {len(train_dataset)}") if finetuner_args.do_eval: eval_dataset_args = deepcopy(data_args) @@ -509,100 +321,48 @@ def compute_metrics(eval_preds): # Initialize our Trainer training_args = finetuner_args + FinetuningTrainer = Trainer + trainer_callbacks = [] - if model_args.use_lora: - FinetuningTrainer = PeftTrainer - trainer_callbacks = [PeftSavingCallback] - else: - FinetuningTrainer = Trainer - trainer_callbacks = [] if data_collator is None: data_collator = default_data_collator if training_args.use_customized_optim: BaseTrainer = FinetuningTrainer - FinetuningTrainer = self.create_customized_optimizer( - BaseTrainer, model_args - ) + FinetuningTrainer = create_customized_optimizer(BaseTrainer, model_args) if training_args.use_lisa: - class 
DynamicLayerActivationCallback(TrainerCallback): - def __init__(self, n_layers, interval_steps, model): - super().__init__() - self.n_layers = n_layers - self.interval_steps = interval_steps - self.model = model - - # Determine the way to access layers based on the model type - class_to_layers_map = { - 'LlamaForCausalLM': 'model.model.layers', - 'Qwen2ForCausalLM': 'model.model.layers', - 'MistralForCausalLM': 'model.model.layers', - 'MixtralForCausalLM': 'model.model.layers', - 'GemmaForCausalLM': 'model.model.layers', - 'GPT2LMHeadModel': 'model.transformer.h', - 'HymbaForCausalLM': 'model.model.layers', - } - model_class_name = self.model.__class__.__name__ - if model_class_name in class_to_layers_map: - self.layers_attribute = class_to_layers_map[model_class_name] - else: - self.layers_attribute = training_args.lisa_layers_attribute - self.total_layers = len(eval('self.' + self.layers_attribute)) # Dynamically execute to get the number of layers - - self.active_layers_indices = [] - - def freeze_all_layers(self): - layers = eval('self.' + self.layers_attribute) # Dynamically execute to get layers - for layer in layers: - for param in layer.parameters(): - param.requires_grad = False - - def on_step_begin(self, args, state, control, **kwargs): - # Check if it's time to switch active layers, including at step 0 - if state.global_step % self.interval_steps == 0: - self.switch_active_layers() - - def switch_active_layers(self): - # First, disable gradients for all layers - self.freeze_all_layers() - - # Randomly select n_layers to activate - layers = eval('self.' + self.layers_attribute) # Re-fetch layer references - self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False) - print(f"Activating layers at indices: {self.active_layers_indices} for the next steps.", flush=True) - - # Enable gradients only for the selected layers - for idx in self.active_layers_indices: - for param in layers[idx].parameters(): - param.requires_grad = True - - # Instantiate the callback dynamic_layer_activation_callback = DynamicLayerActivationCallback( - n_layers=training_args.lisa_activated_layers, # Number of layers to activate - interval_steps=training_args.lisa_interval_steps, # Step interval to update active layers - model=model.get_backend_model() + n_layers=training_args.lisa_activated_layers, # Number of layers to activate + interval_steps=training_args.lisa_interval_steps, # Step interval to update active layers + model=model.get_backend_model(), + lisa_layers_attribute=training_args.lisa_layers_attribute, # Attribute to access layers of the model ) - trainer_callbacks.append(dynamic_layer_activation_callback) - trainer = FinetuningTrainer( - model=model.get_backend_model(), - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - tokenizer=model.get_tokenizer(), - # Data collator will default to DataCollatorWithPadding, so we change it. 
- data_collator=data_collator, - compute_metrics=compute_metrics if training_args.do_eval else None, - preprocess_logits_for_metrics=preprocess_logits_for_metrics if training_args.do_eval else None, - callbacks=trainer_callbacks - ) + trainer_kwargs = { + "model": model.get_backend_model(), + "args": training_args, + "train_dataset": train_dataset if training_args.do_train else None, + "eval_dataset": eval_dataset if training_args.do_eval else None, + "data_collator": data_collator, + "compute_metrics": compute_metrics if training_args.do_eval else None, + "preprocess_logits_for_metrics": preprocess_logits_for_metrics if training_args.do_eval else None, + "callbacks": trainer_callbacks, + } + if is_package_version_at_least("transformers", "4.46.0"): + # https://github.com/huggingface/transformers/pull/32385 + trainer_kwargs["processing_class"] = model.get_tokenizer() + else: + trainer_kwargs["tokenizer"] = model.get_tokenizer() + trainer = FinetuningTrainer(**trainer_kwargs) + # Training if training_args.do_train: checkpoint = None last_checkpoint = self.last_checkpoint if training_args.resume_from_checkpoint is not None: + # load from lora checkpoint is also supported checkpoint = training_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint @@ -618,10 +378,8 @@ def switch_active_layers(self): if self.finetuner_args.save_language_projection: language_projection_state = trainer.model.language_projection.state_dict() torch.save( - osp.join( - self.finetuner_args.output_dir, - "language_projection.pth"), - language_projection_state) + os.path.join(self.finetuner_args.output_dir, "language_projection.pth"), language_projection_state + ) metrics = train_result.metrics max_train_samples = ( diff --git a/src/lmflow/pipeline/inferencer.py b/src/lmflow/pipeline/inferencer.py index 855284fa7..f79ba5d34 100644 --- a/src/lmflow/pipeline/inferencer.py +++ b/src/lmflow/pipeline/inferencer.py @@ -1,38 +1,38 @@ #!/usr/bin/env python -# coding=utf-8 """The Inferencer class simplifies the process of model inferencing.""" import copy -import os -import torch -import wandb -import deepspeed -import sys -import numpy as np -import datetime -import json -import time import logging -from typing import Dict, List -from concurrent.futures import ThreadPoolExecutor +import os import subprocess -from accelerate import Accelerator -from transformers import AutoConfig +import numpy as np +import torch import torch.distributed as dist import torch.nn.functional as F +from accelerate import Accelerator +from transformers import AutoConfig -from lmflow.args import DatasetArguments +from lmflow.args import ( + DatasetArguments, + InferencerArguments, + ModelArguments, +) from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.base_pipeline import BasePipeline from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.utils.data_utils import (set_random_seed, batchlize, - answer_extraction, process_image_flag) +from lmflow.pipeline.base_pipeline import BasePipeline from lmflow.utils.constants import IMAGE_TOKEN_INDEX +from lmflow.utils.data_utils import batchlize, set_random_seed +from lmflow.utils.envs import is_accelerate_env +from lmflow.utils.versioning import is_deepspeed_available + os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers + + def rstrip_partial_utf8(string): return string.replace("\ufffd", "") + supported_dataset_type = [ "text_only", "image_text", @@ -40,6 +40,7 @@ def rstrip_partial_utf8(string): 
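rstrip_partial_utf8 above exists because streamed generation decodes text chunk by chunk, and a chunk may end in the middle of a multi-byte UTF-8 character; decoding such a fragment produces the U+FFFD replacement character, which the helper strips before the partial text is yielded. A tiny self-contained illustration (the example string is arbitrary):

def rstrip_partial_utf8(string):  # same helper as defined above
    return string.replace("\ufffd", "")

data = "café".encode("utf-8")                           # b'caf\xc3\xa9'
partial = data[:-1].decode("utf-8", errors="replace")   # é is cut in half -> 'caf\ufffd'
print(rstrip_partial_utf8(partial))                     # 'caf'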
logger = logging.getLogger(__name__) + class Inferencer(BasePipeline): """ Initializes the `Inferencer` class with given arguments. @@ -57,7 +58,13 @@ class Inferencer(BasePipeline): """ - def __init__(self, model_args, data_args, inferencer_args): + + def __init__( + self, + model_args: ModelArguments, + data_args: DatasetArguments, + inferencer_args: InferencerArguments, + ): self.data_args = data_args self.inferencer_args = inferencer_args self.model_args = model_args @@ -66,28 +73,28 @@ def __init__(self, model_args, data_args, inferencer_args): self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) - if inferencer_args.device == "gpu": + if inferencer_args.device == "gpu": # FIXME: a bit weird here torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error - deepspeed.init_distributed() + if not is_accelerate_env() and is_deepspeed_available(): + import deepspeed + + deepspeed.init_distributed() else: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "15000" - dist.init_process_group( - "gloo", rank=self.local_rank, world_size=self.world_size - ) + dist.init_process_group("gloo", rank=self.local_rank, world_size=self.world_size) self.config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) try: self.model_hidden_size = self.config.hidden_size - except: + except Exception: print("Error in setting hidden size, use the default size 1024") - self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config + self.model_hidden_size = 1024 # gpt2 seems do not have hidden_size in config - if inferencer_args.use_accelerator: + if is_accelerate_env(): self.accelerator = Accelerator() self.accelerator.wait_for_everyone() - def create_dataloader(self, dataset: Dataset): r"""Batchlize dataset and format it to dataloader. @@ -101,17 +108,14 @@ def create_dataloader(self, dataset: Dataset): """ if dataset.get_type() == "text_only": data_dict = dataset.to_dict() - inputs = [instance["text"] for instance in data_dict["instances"] ] + inputs = [instance["text"] for instance in data_dict["instances"]] elif dataset.get_type() == "image_text": inputs = dataset.to_list() dataset_size = len(inputs) dataset_buf = [] for idx in range(dataset_size): - dataset_buf.append({ - "input": inputs[idx], - "input_idx": idx - }) + dataset_buf.append({"input": inputs[idx], "input_idx": idx}) dataloader = batchlize( dataset_buf, @@ -120,16 +124,15 @@ def create_dataloader(self, dataset: Dataset): ) return dataloader, dataset_size - def inference( self, model, dataset: Dataset, - max_new_tokens: int=100, - temperature: float=0.0, - prompt_structure: str='{input}', - remove_image_flag: bool=False, - chatbot_type: str="mini_gpt", + max_new_tokens: int = 100, + temperature: float = 0.0, + prompt_structure: str = "{input}", + remove_image_flag: bool = False, + chatbot_type: str = "mini_gpt", ): """ Perform inference for a model @@ -147,28 +150,22 @@ def inference( output_dataset: Dataset object. 
""" if dataset.get_type() not in supported_dataset_type: - raise NotImplementedError( - 'input dataset should have type {}'.format( - supported_dataset_type)) + raise NotImplementedError("input dataset should have type {}".format(supported_dataset_type)) dataloader, data_size = self.create_dataloader(dataset) # The output dataset - output_dict = { - "type": "text_only", - "instances": [ - ] - } + output_dict = {"type": "text_only", "instances": []} for batch_index, batch in enumerate(dataloader): - current_batch = batch[0] # batch size is 1 - if isinstance(current_batch['input'], str): - input = prompt_structure.format(input=current_batch['input']) + current_batch = batch[0] # batch size is 1 + if isinstance(current_batch["input"], str): + input = prompt_structure.format(input=current_batch["input"]) else: - input = current_batch['input'] - input['text'] = prompt_structure.format(input=input['text']) + input = current_batch["input"] + input["text"] = prompt_structure.format(input=input["text"]) - if False and 'images' in input and isinstance(input['images'], list): - input['images'] = np.array(input['images']) + if False and "images" in input and isinstance(input["images"], list): + input["images"] = np.array(input["images"]) if remove_image_flag: # remove the image flag in tokenization; if chatbot_type == "mini_gpt": @@ -177,33 +174,27 @@ def inference( image_split_flag = "" else: raise NotImplementedError - input['text'] = input['text'].split(image_split_flag) + input["text"] = input["text"].split(image_split_flag) # TODO remove this code by update the tokenizer input_ids = [] attention_mask = [] image_token_indexes = [] temp_input = copy.deepcopy(input) - for idx in range(len(input['text'])): - temp_input['text'] = input['text'][idx] - temp_inputs = model.encode( - temp_input, - return_tensors="pt", - add_special_tokens=idx == 0 - ).to(device=self.local_rank) - input_ids.append(temp_inputs['input_ids']) - attention_mask.append(temp_inputs['attention_mask']) + for idx in range(len(input["text"])): + temp_input["text"] = input["text"][idx] + temp_inputs = model.encode(temp_input, return_tensors="pt", add_special_tokens=idx == 0).to( + device=self.local_rank + ) + input_ids.append(temp_inputs["input_ids"]) + attention_mask.append(temp_inputs["attention_mask"]) if chatbot_type == "llava": # add the flag for inserting the image. # TODO should merge the way of handling image flag in minigpt and llava. 
- index_tensor = torch.tensor( - [IMAGE_TOKEN_INDEX] - ).to(device=self.local_rank) + index_tensor = torch.tensor([IMAGE_TOKEN_INDEX]).to(device=self.local_rank) index_tensor = index_tensor.reshape(1, 1) input_ids.append(index_tensor) - attention_mask.append( - torch.ones(1,1).to(device=self.local_rank)) - image_token_indexes.append( - temp_inputs["input_ids"].shape[1]) + attention_mask.append(torch.ones(1, 1).to(device=self.local_rank)) + image_token_indexes.append(temp_inputs["input_ids"].shape[1]) if len(image_token_indexes) > 1: image_token_indexes = image_token_indexes[:-1] if chatbot_type == "llava": @@ -214,27 +205,20 @@ def inference( inputs["attention_mask"] = torch.cat(attention_mask, dim=1) else: if self.inferencer_args.device == "gpu": - inputs = model.encode( - input, return_tensors="pt" - ).to(device=self.local_rank) + inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) elif self.inferencer_args.device == "cpu": - inputs = model.encode( - input, return_tensors="pt" - ).to(device='cpu') + inputs = model.encode(input, return_tensors="pt").to(device="cpu") else: - raise NotImplementedError( - f"device \"{self.inferencer_args.device}\" is not supported" - ) + raise NotImplementedError(f'device "{self.inferencer_args.device}" is not supported') - if self.inferencer_args.use_accelerator: + if is_accelerate_env(): inputs = inputs.to(self.accelerator.device) - if remove_image_flag: inputs["image_token_indexes"] = image_token_indexes inputs["one_sample_multiple_images"] = True - if self.inferencer_args.use_accelerator: + if is_accelerate_env(): with self.accelerator.autocast(): outputs = model.inference( inputs, @@ -242,7 +226,6 @@ def inference( temperature=self.inferencer_args.temperature, repetition_penalty=self.inferencer_args.repetition_penalty, do_sample=self.inferencer_args.do_sample, - use_accelerator=True, ) else: outputs = model.inference( @@ -256,19 +239,29 @@ def inference( # only return the generation, trucating the input if self.model_args.arch_type != "vision_encoder_decoder": text_out = model.decode(outputs[0], skip_special_tokens=True) - prompt_length = len(model.decode(inputs[0], skip_special_tokens=True,)) + prompt_length = len( + model.decode( + inputs[0], + skip_special_tokens=True, + ) + ) text_out = text_out[prompt_length:] else: # to avoid redundant/missing leading space problem, we use a # part of the input text - input_text = inputs['input_ids'][0][-1:] + input_text = inputs["input_ids"][0][-1:] text_out = model.decode(torch.cat([input_text, outputs[0]]), skip_special_tokens=True) - prompt_length = len(model.decode(input_text, skip_special_tokens=True,)) + prompt_length = len( + model.decode( + input_text, + skip_special_tokens=True, + ) + ) text_out = text_out[prompt_length:] - output_dict["instances"].append({ "text": text_out }) + output_dict["instances"].append({"text": text_out}) - output_dataset = Dataset(DatasetArguments(dataset_path = None)) + output_dataset = Dataset(DatasetArguments(dataset_path=None)) output_dataset = output_dataset.from_dict(output_dict) return output_dataset @@ -282,12 +275,12 @@ def stream_inference( temperature, end_string, input_dataset, - remove_image_flag: bool=False, + remove_image_flag: bool = False, ): response = "" history = [] if "ChatGLMModel" in self.config.architectures: - for response, history in model.get_backend_model().stream_chat(model.get_tokenizer(), context, history=history): + for response, _ in model.get_backend_model().stream_chat(model.get_tokenizer(), context, history=history): 
response = rstrip_partial_utf8(response) yield response, False else: @@ -329,10 +322,10 @@ class SpeculativeInferencer(Inferencer): ------------ target_model_args : ModelArguments object. Contains the arguments required to load the target model. - + draft_model_args : ModelArguments object. Contains the arguments required to load the draft model. - + data_args : DatasetArguments object. Contains the arguments required to load the dataset. @@ -341,30 +334,33 @@ class SpeculativeInferencer(Inferencer): """ + def __init__(self, model_args, draft_model_args, data_args, inferencer_args): super().__init__(model_args, data_args, inferencer_args) self.draft_model_args = draft_model_args self.draft_config = AutoConfig.from_pretrained(draft_model_args.model_name_or_path, trust_remote_code=True) - try: + try: self.draft_model_hidden_size = self.draft_config.hidden_size - except: + except Exception: print("Error in setting hidden size for draft model, use the default size 1024") self.draft_model_hidden_size = 768 - - - @staticmethod - def score_to_prob(scores: torch.Tensor, - temperature: float = 0., - top_p: float = 1.,) -> torch.Tensor: - """Convert scores (NOT softmaxed tensor) to probabilities with support for temperature, top-p sampling, and argmax. + + @staticmethod + def score_to_prob( + scores: torch.Tensor, + temperature: float = 0.0, + top_p: float = 1.0, + ) -> torch.Tensor: + """ + Convert scores (NOT softmaxed tensor) to probabilities with support for temperature, top-p sampling, and argmax. Parameters ---------- scores : torch.Tensor Input scores. temperature : float, optional - Temperature parameter for controlling randomness. Higher values make the distribution more uniform, + Temperature parameter for controlling randomness. Higher values make the distribution more uniform, lower values make it peakier. When temperature <= 1e-6, argmax is used. by default 0.0 top_p : float, optional Top-p sampling parameter for controlling the cumulative probability threshold, by default 1.0 (no threshold) @@ -376,7 +372,7 @@ def score_to_prob(scores: torch.Tensor, """ assert temperature >= 0.0 assert 0.0 < top_p <= 1.0 - + if temperature <= 1e-6: final_prob = F.one_hot(scores.argmax(dim=1), num_classes=scores.size(1)).float() else: @@ -393,61 +389,57 @@ def score_to_prob(scores: torch.Tensor, final_prob.scatter_add_(1, sorted_scores.argsort(dim=1), thresholded_probs) else: final_prob = scores.softmax(dim=1) - + else: final_prob = scores.softmax(dim=1) return final_prob - - + @staticmethod - def sample(prob: torch.Tensor, num_samples: int = 1) -> Dict: - """Sample from a tensor of probabilities - """ - sampled_indices = torch.multinomial(prob, num_samples=num_samples, replacement=True) - return {'sampled_token': sampled_indices, 'sampled_prob': prob.gather(dim=1, index=sampled_indices), 'all_prob': prob} - - + def sample(prob: torch.Tensor, num_samples: int = 1) -> dict: + """Sample from a tensor of probabilities""" + sampled_indices = torch.multinomial(prob, num_samples=num_samples, replacement=True) + return { + "sampled_token": sampled_indices, + "sampled_prob": prob.gather(dim=1, index=sampled_indices), + "all_prob": prob, + } + @staticmethod def predict_next_token(model: HFDecoderModel, input_ids: torch.Tensor, num_new_tokens: int = 1): - """Predict the next token given the input_ids. 
- """ - output = model.inference(input_ids, - use_accelerator=True, - max_new_tokens=num_new_tokens, - return_dict_in_generate=True, - output_scores=True, - do_sample=True, - num_beams=1) + """Predict the next token given the input_ids.""" + output = model.inference( + input_ids, + max_new_tokens=num_new_tokens, + return_dict_in_generate=True, + output_scores=True, + do_sample=True, + num_beams=1, + ) return output - - - def autoregressive_sampling(self, - input_ids: torch.Tensor, - model: HFDecoderModel, - temperature: float = 0., - num_new_tokens: int = 5) -> Dict: - """Ref: [arXiv:2211.17192v2](https://arxiv.org/abs/2211.17192) Section 2.2 - """ + + def autoregressive_sampling( + self, input_ids: torch.Tensor, model: HFDecoderModel, temperature: float = 0.0, num_new_tokens: int = 5 + ) -> dict: + """Ref: [arXiv:2211.17192v2](https://arxiv.org/abs/2211.17192) Section 2.2""" sequence = input_ids new_tokens = [] - + for _ in range(num_new_tokens): - pred = self.predict_next_token(model=model, input_ids=sequence, num_new_tokens=1) # predict next one token + pred = self.predict_next_token(model=model, input_ids=sequence, num_new_tokens=1) # predict next one token prob = self.score_to_prob(pred.scores[0], temperature=temperature) sampled = self.sample(prob=prob, num_samples=1) new_tokens.append(sampled) - sequence = torch.cat([sequence, sampled['sampled_token']], dim=1) - + sequence = torch.cat([sequence, sampled["sampled_token"]], dim=1) + return {"sequence": sequence, "new_tokens": new_tokens} - - + def inference( self, model: HFDecoderModel, draft_model: HFDecoderModel, input: str, - temperature: float = 0., + temperature: float = 0.0, gamma: int = 5, max_new_tokens: int = 100, ): @@ -458,19 +450,19 @@ def inference( ------------ model : HFDecoderModel object. TunableModel to verify tokens generated by the draft model. - + draft_model : HFDecoderModel object. TunableModel that provides approximations of the target model. input : str. The input text (i.e., the prompt) for the model. - + gamma : int. The number of tokens to be generated by the draft model within each iter. - + max_new_tokens : int. The maximum number of tokens to be generated by the target model. - + Returns ------- @@ -482,100 +474,93 @@ def inference( if self.inferencer_args.device == "gpu": inputs = model.encode(input, return_tensors="pt").to(device=self.local_rank) elif self.inferencer_args.device == "cpu": - inputs = model.encode(input, return_tensors="pt").to(device='cpu') + inputs = model.encode(input, return_tensors="pt").to(device="cpu") else: - raise NotImplementedError( - f"device \"{self.inferencer_args.device}\" is not supported" - ) + raise NotImplementedError(f'device "{self.inferencer_args.device}" is not supported') - - def speculative_sampling(input_ids: torch.Tensor, - model: HFDecoderModel, - draft_model: HFDecoderModel, - temperature: float = 0.) 
-> torch.Tensor: + def speculative_sampling( + input_ids: torch.Tensor, model: HFDecoderModel, draft_model: HFDecoderModel, temperature: float = 0.0 + ) -> torch.Tensor: """Ref: [arXiv:2211.17192v2](https://arxiv.org/abs/2211.17192) Parameters ---------- input_ids : torch.Tensor draft_model : TunableModel object - model_list : List[TunableModel object] + model_list : list[TunableModel object] Returns ------- torch.Tensor """ - len_input_ids= input_ids.shape[1] + len_input_ids = input_ids.shape[1] logger.debug(f"len of input_ids: {len_input_ids}") - + # STEP 1: Sample γ guesses x1, ..., xγ from Mq (draft model) autoregressively output_draft = self.autoregressive_sampling(input_ids=input_ids, model=draft_model, num_new_tokens=gamma) logger.debug(f"draft result: {output_draft['sequence']}") logger.debug(f"draft result decoded: {draft_model.decode(output_draft['sequence'][0])}") - - + # STEP 2: Run Mp (target model) in parallel # generate sequences [prefix, x1, x2, ..., xγ] - output = model.get_backend_model()(input_ids=output_draft['sequence'], return_dict=True) - logger.debug(f'shape of output: {output.logits.shape}') - - + output = model.get_backend_model()(input_ids=output_draft["sequence"], return_dict=True) + logger.debug(f"shape of output: {output.logits.shape}") + # STEP 3: Determine the number of accepted guesses n accepted = [False] * gamma for i in range(gamma): - draft_sampled_token_id = output_draft['new_tokens'][i]['sampled_token'] - draft_sampled_token_prob = output_draft['new_tokens'][i]['sampled_prob'] - token_prob = self.score_to_prob(output.logits[:,len_input_ids+i-1,:], temperature=temperature)[0, draft_sampled_token_id] + draft_sampled_token_id = output_draft["new_tokens"][i]["sampled_token"] + draft_sampled_token_prob = output_draft["new_tokens"][i]["sampled_prob"] + token_prob = self.score_to_prob(output.logits[:, len_input_ids + i - 1, :], temperature=temperature)[ + 0, draft_sampled_token_id + ] # reject the sample with probability 1 - p(x)/q(x) - if torch.rand_like(token_prob) > token_prob/draft_sampled_token_prob: + if torch.rand_like(token_prob) > token_prob / draft_sampled_token_prob: break else: accepted[i] = True - - logger.debug(f"Speculative Sampling: Accepted: {sum(accepted)}/{gamma}") + logger.debug(f"Speculative Sampling: Accepted: {sum(accepted)}/{gamma}") # STEP 4: Adjust the distribution from Mp if needed if not all(accepted): - all_prob = self.score_to_prob(output.logits[:,len_input_ids+i-1,:], temperature=temperature) - draft_all_prob = output_draft['new_tokens'][i]['all_prob'] + all_prob = self.score_to_prob(output.logits[:, len_input_ids + i - 1, :], temperature=temperature) + draft_all_prob = output_draft["new_tokens"][i]["all_prob"] adjusted_prob = torch.max(torch.zeros_like(all_prob), all_prob - draft_all_prob) prob = adjusted_prob / adjusted_prob.sum(dim=1, keepdim=True) else: - prob = self.score_to_prob(output.logits[:,-1,:], temperature=temperature) - + prob = self.score_to_prob(output.logits[:, -1, :], temperature=temperature) # STEP 5: Return n tokens from Mq, and one token from Mp - token_from_target_model = self.sample(prob)['sampled_token'] - final_sequence = torch.concat([output_draft['sequence'][:,:len_input_ids+sum(accepted)], token_from_target_model], dim=1) + token_from_target_model = self.sample(prob)["sampled_token"] + final_sequence = torch.concat( + [output_draft["sequence"][:, : len_input_ids + sum(accepted)], token_from_target_model], dim=1 + ) return final_sequence - num_generated_new_tokens = 0 len_raw_input = len(inputs[0]) 
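speculative_sampling above follows the accept/reject rule of arXiv:2211.17192: each token proposed by the draft model is kept with probability min(1, p(x)/q(x)), where p is the target model's distribution and q the draft model's, and on the first rejection a replacement token is drawn from the renormalized residual max(0, p - q). A toy sketch of that rule with hand-written distributions over a four-token vocabulary instead of real model scores:

import torch

p = torch.tensor([0.5, 0.2, 0.2, 0.1])  # target model probabilities (toy values)
q = torch.tensor([0.1, 0.6, 0.2, 0.1])  # draft model probabilities (toy values)

draft_token = torch.multinomial(q, 1).item()  # token proposed by the draft model

if torch.rand(()).item() <= min(1.0, (p[draft_token] / q[draft_token]).item()):
    # Accept the draft token with probability min(1, p(x) / q(x)).
    token = draft_token
else:
    # On rejection, resample from the adjusted distribution max(0, p - q), renormalized.
    residual = torch.clamp(p - q, min=0.0)
    token = torch.multinomial(residual / residual.sum(), 1).item()

print(token)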
while num_generated_new_tokens < max_new_tokens: - logger.debug(f'===== New iter =====') + logger.debug("===== New iter =====") logger.debug(f"input_ids: {inputs}") - sampling_result = speculative_sampling(input_ids=inputs, - model=model, - draft_model=draft_model, - temperature=temperature) - logger.debug(f'sampling result: {sampling_result}') - logger.debug(f'sampling result decoded: {model.decode(sampling_result[0])}') + sampling_result = speculative_sampling( + input_ids=inputs, model=model, draft_model=draft_model, temperature=temperature + ) + logger.debug(f"sampling result: {sampling_result}") + logger.debug(f"sampling result decoded: {model.decode(sampling_result[0])}") num_generated_new_tokens += len(sampling_result[0]) - len(inputs[0]) inputs = sampling_result - - - # if, say, num_generated_new_tokens = 19, and the model accept 3 + + # if, say, num_generated_new_tokens = 19, and the model accept 3 # tokens, the actual generated tokens would be 22. - return model.decode(inputs[0,:len_raw_input+max_new_tokens]) - + return model.decode(inputs[0, : len_raw_input + max_new_tokens]) def stream_inference(self): raise NotImplementedError("Streaming output for SpeculativeInferencer is not supported yet") + class ToolInferencer(Inferencer): """ Initializes the `ToolInferencer` class with given arguments. @@ -593,6 +578,7 @@ class ToolInferencer(Inferencer): """ + def __init__(self, model_args, data_args, inferencer_args): super().__init__(model_args, data_args, inferencer_args) @@ -602,7 +588,7 @@ def inference( self, model: HFDecoderModel, input: str, - max_new_tokens: int=1024, + max_new_tokens: int = 1024, ): """ Perform inference for a model @@ -613,8 +599,8 @@ def inference( TunableModel to perform inference input : str. - The input text (i.e., the prompt) for the model. - + The input text (i.e., the prompt) for the model. + max_new_tokens : int. The maximum number of tokens to be generated by the model. 
@@ -626,20 +612,18 @@ def inference( if self.inferencer_args.device == "gpu": input_id = model.encode(input, return_tensors="pt").to(device=self.local_rank) elif self.inferencer_args.device == "cpu": - input_id = model.encode(input, return_tensors="pt").to(device='cpu') + input_id = model.encode(input, return_tensors="pt").to(device="cpu") logger.debug(f"input_id: {input_id}") - input_length = input_id.shape[1] output_id = model.inference( input_id, - use_accelerator=True, max_new_tokens=max_new_tokens, # pad_token_id=model.tokenizer.eos_token_id, ) # logger.debug(f"output: {output_id}") output = model.decode(output_id[0]) - output = output.replace(input,"") + output = output.replace(input, "") return output - + def code_exec(self, code): # Execute the code result = subprocess.run(["python", "-c", code], capture_output=True, text=True) @@ -653,10 +637,3 @@ def code_exec(self, code): print("Error:") print(result.stderr) return result - - - - - - - diff --git a/src/lmflow/pipeline/iterative_dpo_aligner.py b/src/lmflow/pipeline/iterative_dpo_aligner.py index 59d122545..6e384736f 100644 --- a/src/lmflow/pipeline/iterative_dpo_aligner.py +++ b/src/lmflow/pipeline/iterative_dpo_aligner.py @@ -1,26 +1,24 @@ import copy -from dataclasses import fields -import gc import json import logging +from dataclasses import fields from pathlib import Path -from typing import List, Dict, Any, Optional from tqdm import tqdm -from lmflow.models.hf_text_regression_model import HFTextRegressionModel -from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.datasets.dataset import Dataset -from lmflow.pipeline.dpov2_aligner import MemorySafeDPOv2Aligner -from lmflow.pipeline.rm_inferencer import RewardModelInferencer -from lmflow.pipeline.vllm_inferencer import MemorySafeVLLMInferencer from lmflow.args import ( - ModelArguments, - DatasetArguments, + DatasetArguments, + DPOv2AlignerArguments, InferencerArguments, IterativeDPOAlignerArguments, - DPOv2AlignerArguments, + ModelArguments, ) +from lmflow.datasets.dataset import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.models.hf_text_regression_model import HFTextRegressionModel +from lmflow.pipeline.dpov2_aligner import MemorySafeDPOv2Aligner +from lmflow.pipeline.rm_inferencer import RewardModelInferencer +from lmflow.pipeline.vllm_inferencer import MemorySafeVLLMInferencer from lmflow.utils.common import print_banner logger = logging.getLogger(__name__) @@ -31,7 +29,7 @@ def __init__( self, model_args: ModelArguments, data_args: DatasetArguments, - aligner_args:IterativeDPOAlignerArguments, + aligner_args: IterativeDPOAlignerArguments, ref_model_args: ModelArguments, reward_model_args: ModelArguments, **kwargs, @@ -42,34 +40,27 @@ def __init__( self.ref_model_args = ref_model_args self.reward_model_args = reward_model_args self.workspace_path = Path(self.aligner_args.output_dir) - - - def align( - self, - dataset_list: List[Dataset] - ): + + def align(self, dataset_list: list[Dataset]): num_iterations = len(dataset_list) - + for iter_idx in tqdm( - range(self.aligner_args.initial_iter_idx, num_iterations), - desc="Iterative DPO Align", - unit="iteration" + range(self.aligner_args.initial_iter_idx, num_iterations), desc="Iterative DPO Align", unit="iteration" ): if iter_idx == 0: target_model_args = self.model_args else: target_model_args = copy.deepcopy(self.model_args) - target_model_args.model_name_or_path = str(self.workspace_path/f"iteration_{iter_idx}"/"model") - + target_model_args.model_name_or_path = 
str(self.workspace_path / f"iteration_{iter_idx}" / "model") + self._align_single_iteration( - iteration_name=f"iteration_{iter_idx+1}", + iteration_name=f"iteration_{iter_idx + 1}", target_model_args=target_model_args, reward_model_args=self.reward_model_args, ref_model_args=self.ref_model_args, dataset=dataset_list[iter_idx], ) - - + def _align_single_iteration( self, iteration_name: str, @@ -80,58 +71,57 @@ def _align_single_iteration( ): if self.aligner_args.do_response_generation: # generate responses - print_banner(f'Iterative DPO {iteration_name}: Generate responses') - model = HFDecoderModel( - model_args=target_model_args, - tune_strategy='none' - ) + print_banner(f"Iterative DPO {iteration_name}: Generate responses") + model = HFDecoderModel(model_args=target_model_args, do_train=False) self._do_target_model_inference( model=model, dataset=dataset, - output_dir=str(self.workspace_path/iteration_name), + output_dir=str(self.workspace_path / iteration_name), ) del model - + if self.aligner_args.do_scoring: - # reward model scoring - print_banner(f'Iterative DPO {iteration_name}: Reward model scoring') + # reward model scoring + print_banner(f"Iterative DPO {iteration_name}: Reward model scoring") reward_model = HFTextRegressionModel( model_args=reward_model_args, - tune_strategy='none', - use_accelerator=self.aligner_args.use_accelerator, + do_train=False, ) target_model_inference_result_data_args = copy.deepcopy(dataset.data_args) - target_model_inference_result_data_args.dataset_path = str(self.workspace_path/iteration_name/"target_model_inference_result") + target_model_inference_result_data_args.dataset_path = str( + self.workspace_path / iteration_name / "target_model_inference_result" + ) target_model_inference_result_data_args.block_size = self.aligner_args.reward_model_inference_block_size target_model_inference_result_dataset = Dataset(target_model_inference_result_data_args) self._do_reward_model_inference( model=reward_model, dataset=target_model_inference_result_dataset, - output_dir=str(self.workspace_path/iteration_name), + output_dir=str(self.workspace_path / iteration_name), ) del reward_model - + if self.aligner_args.do_dpo_align: # DPO training - print_banner(f'Iterative DPO {iteration_name}: DPO training') + print_banner(f"Iterative DPO {iteration_name}: DPO training") dpo_train_data_args = copy.deepcopy(dataset.data_args) - dpo_train_data_args.dataset_path = str(self.workspace_path/iteration_name/"reward_model_inference_result") + dpo_train_data_args.dataset_path = str( + self.workspace_path / iteration_name / "reward_model_inference_result" + ) self._do_single_dpo_align( model_args=target_model_args, ref_model_args=ref_model_args, data_args=dpo_train_data_args, - output_dir=str(self.workspace_path/iteration_name/"model"), + output_dir=str(self.workspace_path / iteration_name / "model"), iteration_name=iteration_name, ) - - + def _do_target_model_inference( self, model: HFDecoderModel, dataset: Dataset, output_dir: str, ): - result_cache_path = str(Path(output_dir)/"cache"/"target_model_inference_result.json") + result_cache_path = str(Path(output_dir) / "cache" / "target_model_inference_result.json") inferencer = MemorySafeVLLMInferencer( model_args=model.model_args, data_args=dataset.data_args, @@ -141,19 +131,18 @@ def _do_target_model_inference( ), ) res = inferencer.inference() - + dataset_out = {"type": "text_to_textlist", "instances": res} - - target_model_inference_result_dir = Path(output_dir)/"target_model_inference_result" + + 
target_model_inference_result_dir = Path(output_dir) / "target_model_inference_result" target_model_inference_result_dir.mkdir(parents=True, exist_ok=True) json.dump( - dataset_out, - open(str(target_model_inference_result_dir/"result.json"), "w", encoding='utf-8'), + dataset_out, + open(str(target_model_inference_result_dir / "result.json"), "w", encoding="utf-8"), ensure_ascii=False, indent=4, ) - - + def _do_reward_model_inference( self, model: HFTextRegressionModel, @@ -174,12 +163,11 @@ def _do_reward_model_inference( distributed_inference_num_instances=self.aligner_args.distributed_inference_num_instances, inference_batch_size=self.aligner_args.reward_model_inference_batch_size, ) - - reward_model_inference_result_dir = Path(output_dir)/"reward_model_inference_result" + + reward_model_inference_result_dir = Path(output_dir) / "reward_model_inference_result" reward_model_inference_result_dir.mkdir(parents=True, exist_ok=True) - res.save(str(reward_model_inference_result_dir/"result.json")) - - + res.save(str(reward_model_inference_result_dir / "result.json")) + def _do_single_dpo_align( self, model_args: ModelArguments, @@ -199,8 +187,7 @@ def _do_single_dpo_align( ref_model_args=ref_model_args, ) aligner.align() - - + def _parse_target_model_inference_args( self, args: IterativeDPOAlignerArguments, @@ -210,12 +197,11 @@ def _parse_target_model_inference_args( mixed_args=args, target_cls=InferencerArguments, ) - inferencer_args.save_results=True - inferencer_args.results_path=result_cache_path - + inferencer_args.save_results = True + inferencer_args.results_path = result_cache_path + return inferencer_args - - + def _parse_reward_model_inference_args( self, args: IterativeDPOAlignerArguments, @@ -224,10 +210,9 @@ def _parse_reward_model_inference_args( mixed_args=args, target_cls=InferencerArguments, ) - + return inferencer_args - - + def _parse_dpo_aligner_args( self, args: IterativeDPOAlignerArguments, @@ -240,10 +225,9 @@ def _parse_dpo_aligner_args( ) aligner_args.output_dir = output_dir aligner_args.run_name = f"{args.run_name}_{iteration_name}" - + return aligner_args - - + def __filter_args( self, mixed_args, @@ -251,4 +235,4 @@ def __filter_args( ): target_cls_fields = {f.name for f in fields(target_cls) if f.init} common_fields = {f: getattr(mixed_args, f) for f in target_cls_fields if hasattr(mixed_args, f)} - return target_cls(**common_fields) \ No newline at end of file + return target_cls(**common_fields) diff --git a/src/lmflow/pipeline/raft_aligner.py b/src/lmflow/pipeline/raft_aligner.py index 3c95810a9..72cd4cd79 100644 --- a/src/lmflow/pipeline/raft_aligner.py +++ b/src/lmflow/pipeline/raft_aligner.py @@ -1,32 +1,29 @@ #!/usr/bin/env python -# coding=utf-8 """ The Aligner class simplifies the process of running alignment. """ import logging -import numpy as np import os import sys import time from itertools import chain +import numpy as np import torch import torch.distributed as dist import transformers from datasets import ( - set_caching_enabled, Dataset, DatasetDict, + set_caching_enabled, ) from transformers import ( default_data_collator, - pipeline, set_seed, ) from transformers.testing_utils import CaptureLogger -from lmflow.args import DatasetArguments from lmflow.datasets.dataset import Dataset as LMFlowDataset from lmflow.pipeline.base_aligner import BaseAligner from lmflow.pipeline.utils.raft_trainer import RaftTrainer @@ -42,7 +39,7 @@ class RaftAligner(BaseAligner): ------------ model_args : ModelArguments object. 
Contains the arguments required to load the model. - + data_args : DatasetArguments object. Contains the arguments required to load the dataset. @@ -51,11 +48,12 @@ class RaftAligner(BaseAligner): args : Optional. Positional arguments. - + kwargs : Optional. Keyword arguments. """ + def __init__(self, model_args, data_args, aligner_args, *args, **kwargs): self.model_args = model_args self.data_args = data_args @@ -79,7 +77,6 @@ def __init__(self, model_args, data_args, aligner_args, *args, **kwargs): except OSError: pass - def _initialize_trainer(self, model, tokenizer, training_args): """ This function takes the model and tokenizer as the input and initialize the trainer. @@ -87,7 +84,7 @@ def _initialize_trainer(self, model, tokenizer, training_args): trainer = RaftTrainer( model=model, args=training_args, - train_dataset=Dataset.from_dict({"text": [ " " ] }), + train_dataset=Dataset.from_dict({"text": [" "]}), eval_dataset=Dataset.from_dict({}), tokenizer=tokenizer, data_collator=default_data_collator, @@ -96,7 +93,6 @@ def _initialize_trainer(self, model, tokenizer, training_args): ) return trainer - def _load_dataset( self, selected_dataset, @@ -106,9 +102,9 @@ def _load_dataset( data_args, training_args, ): - ''' + """ This function prepares the dataset for every iteration. - ''' + """ raw_datasets = selected_dataset if training_args.do_train: @@ -208,7 +204,7 @@ def group_texts(examples): batched=True, batch_size=group_batch_size, ) - + if training_args.do_train: if "train" not in tokenized_datasets: raise ValueError("--do_train requires a train dataset") @@ -219,7 +215,6 @@ def group_texts(examples): return train_dataset - def _load_input_dataset(self, dataset, tokenizer): """ Load input dataset (i.e. prompt/question dataset) for training. @@ -236,13 +231,13 @@ def _load_input_dataset(self, dataset, tokenizer): def tokenize(sample): sample["input_ids"] = tokenizer.encode(sample["text"]) - sample['input'] = tokenizer.decode(sample["input_ids"]) + sample["input"] = tokenizer.decode(sample["input_ids"]) return sample ds = ds.map(tokenize, batched=False) ds = ds.filter(lambda x: len(x["input_ids"]) <= 256) - ds.set_format(type='torch') + ds.set_format(type="torch") return ds @@ -250,12 +245,12 @@ def _clean_text(self, text): if len(text) == 0: return text stext = [x for x in text.split("###Human") if x] - return stext[0].strip().strip("#") + return stext[0].strip().strip("#") def _discard_sample(self, text): if "#" in text: return True - elif len(text) < 2: # delete empty sample + elif len(text) < 2: # delete empty sample return True return False @@ -281,8 +276,8 @@ def _get_batch_dataset_top( # we will get the batch dataset via Dataset.from_dict start_time = time.time() - query_tensors = batch_input['input_ids'] - querys = batch_input['input'] + query_tensors = batch_input["input_ids"] + querys = batch_input["input"] data_size = len(querys) reward_eva = [] # record the reward of the samples @@ -292,7 +287,7 @@ def _get_batch_dataset_top( for i, query_tensor in enumerate(query_tensors): query = querys[i] input_texts.append(query) - if (i + 1) % infer_batch_size == 0 or (i+1 == data_size): + if (i + 1) % infer_batch_size == 0 or (i + 1 == data_size): gen_len = np.random.randint(output_min_length, output_max_length) generation_kwargs["max_new_tokens"] = gen_len inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(training_args.device) @@ -304,15 +299,15 @@ def _get_batch_dataset_top( ] texts_for_rewards = [q + r for q, r in zip(input_texts, generated_texts)] - 
texts_for_reward_dataset = LMFlowDataset.create_from_dict({ - "type": "text_only", - "instances": [ - { "text": text } for text in texts_for_rewards - ], - }) + texts_for_reward_dataset = LMFlowDataset.create_from_dict( + { + "type": "text_only", + "instances": [{"text": text} for text in texts_for_rewards], + } + ) reward_dataset = reward_model.inference(texts_for_reward_dataset) - rewards = [ sample["value"] for sample in reward_dataset.to_dict()["instances"] ] + rewards = [sample["value"] for sample in reward_dataset.to_dict()["instances"]] reward_eva.extend(rewards) responses.extend(generated_texts) @@ -321,8 +316,7 @@ def _get_batch_dataset_top( data = [{"input": querys[j], "output": [responses[j]]} for j in range(len(reward_eva))] world_size = int(os.getenv("WORLD_SIZE", "1")) - all_process_list =[{}] * world_size - + all_process_list = [{}] * world_size data_to_send = [[data[i], reward_eva[i]] for i in range(len(data))] dist.all_gather_object(all_process_list, data_to_send) @@ -335,48 +329,53 @@ def _get_batch_dataset_top( tmp_reward = [tmp[1] for tmp in all_process_list[i]] gathered_reward.extend(tmp_reward) - idx = np.argsort(gathered_reward)[::-1][:int(len(gathered_reward) * alpha)] + idx = np.argsort(gathered_reward)[::-1][: int(len(gathered_reward) * alpha)] gathered_data = [gathered_data[j] for j in idx] reward_train = [gathered_reward[j] for j in idx] self.reward_seq.append(np.mean(gathered_reward)) self.train_reawrd.append(np.mean(reward_train)) import matplotlib.pyplot as plt + if training_args.local_rank == 0: plt.plot(self.reward_seq, marker="o") plt.plot(self.train_reawrd, marker="*") plt.legend(["Model reward", "Reward of SFT Set"]) - plt.savefig(self.store_dir + '/training_reward.png') + plt.savefig(self.store_dir + "/training_reward.png") plt.close() logger.info(f"collected data of {len(gathered_data)}") logger.info([np.mean(gathered_reward), np.mean(reward_train)]) if training_args.local_rank == 0 and output_reward_path is not None: - with open(output_reward_path, mode='a') as fout: - fout.write('mean reward: ' + str(np.mean(gathered_reward)) + 'mean reward in training set: ' + str(np.mean(reward_train))) + with open(output_reward_path, mode="a") as fout: + fout.write( + "mean reward: " + + str(np.mean(gathered_reward)) + + "mean reward in training set: " + + str(np.mean(reward_train)) + ) fout.write("\n") - prompt_structure = "{definition}{input}{output}" tmp_output_dataset = { - "text": [ prompt_structure.format( - definition="", input=sample["input"], output=sample["output"][0] - ) for sample in gathered_data + "text": [ + prompt_structure.format(definition="", input=sample["input"], output=sample["output"][0]) + for sample in gathered_data ] } # We store the training set for monitoring the RAFT training - all_texts = tmp_output_dataset['text'] + all_texts = tmp_output_dataset["text"] output_eval_dataset = {} - output_eval_dataset['type'] = 'text_only' - output_eval_dataset['instances'] = [{'text': i_text} for i_text in all_texts] + output_eval_dataset["type"] = "text_only" + output_eval_dataset["instances"] = [{"text": i_text} for i_text in all_texts] import json + if local_rank == 0: - with open(self.store_dir + "/train_set_" + str(iter_id) + ".json", 'w', encoding='utf8') as f: + with open(self.store_dir + "/train_set_" + str(iter_id) + ".json", "w", encoding="utf8") as f: json.dump(output_eval_dataset, f, ensure_ascii=False) - # We need to make sure that the order of the samples are the same for each agent all_process_list = [{}] * world_size data_to_send = 
[tmp_output_dataset, local_rank] @@ -386,165 +385,162 @@ def _get_batch_dataset_top( output_dataset = all_process_list[i][0] break - return DatasetDict({ "train": Dataset.from_dict(output_dataset) }) + return DatasetDict({"train": Dataset.from_dict(output_dataset)}) def _get_batch_dataset_local( - self, - model, - batch_input, - K=8, - iter_id=0, - local_rank=0, - output_min_length=16, - output_max_length=48, - infer_batch_size=8, - generation_kwargs={}, - tokenizer=None, - training_args=None, - reward_model=None, - output_reward_path=None, - ): - """ - :param batch_input: input prompts - """ - # we will get the batch dataset via Dataset.from_dict - start_time = time.time() + self, + model, + batch_input, + K=8, + iter_id=0, + local_rank=0, + output_min_length=16, + output_max_length=48, + infer_batch_size=8, + generation_kwargs={}, + tokenizer=None, + training_args=None, + reward_model=None, + output_reward_path=None, + ): + """ + :param batch_input: input prompts + """ + # we will get the batch dataset via Dataset.from_dict + start_time = time.time() - querys = batch_input['input'] - data_size = len(querys) + querys = batch_input["input"] + data_size = len(querys) - reward_eva = [] - reward_train = [] + reward_eva = [] + reward_train = [] + input_texts = [] + responses = [] + record_querys = [] + all_outputs = [] + + for i, query in enumerate(querys): + input_texts = [query for _ in range(K)] + + gen_len = np.random.randint(output_min_length, output_max_length) + generation_kwargs["max_new_tokens"] = gen_len + inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(training_args.device) + with torch.no_grad(): + outputs = model.generate(**inputs, **generation_kwargs) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + generated_texts = [ + generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts) + ] + generated_texts = [self._clean_text(generated_text) for generated_text in generated_texts] + texts_for_rewards = [q + r for q, r in zip(input_texts, generated_texts)] + + texts_for_reward_dataset = LMFlowDataset.create_from_dict( + { + "type": "text_only", + "instances": [{"text": texts_for_rewards[i]} for i in range(len(texts_for_rewards))], + } + ) + + reward_dataset = reward_model.inference(texts_for_reward_dataset) + rewards = [sample["value"] for sample in reward_dataset.to_dict()["instances"]] + reward_eva.append(rewards[0]) + + ################################ + # we impose some post-detection and discard the samples with certain criteria. 
+ for kk in range(K): + if self._discard_sample(generated_texts[kk]): + rewards[kk] = -self.INF + ################################ + + idx_to_record = np.argmax(rewards) + all_outputs.append(generated_texts[0]) + + # if we discard all the samples, we do not record the sample + if rewards[idx_to_record] != -self.INF: + responses.append(generated_texts[idx_to_record]) + reward_train.append(rewards[idx_to_record]) + record_querys.append(query) input_texts = [] - responses = [] - record_querys = [] - all_outputs = [] - for i, query in enumerate(querys): - input_texts = [query for _ in range(K)] - - gen_len = np.random.randint(output_min_length, output_max_length) - generation_kwargs["max_new_tokens"] = gen_len - inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(training_args.device) - with torch.no_grad(): - outputs = model.generate(**inputs, **generation_kwargs) - generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) - generated_texts = [ - generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts) - ] - generated_texts = [ - self._clean_text(generated_text) for generated_text in generated_texts - ] - texts_for_rewards = [q + r for q, r in zip(input_texts, generated_texts)] + data = [] + for j in range(len(reward_train)): + sample = {} + sample["input"] = record_querys[j] + sample["output"] = [responses[j]] + data.append(sample) - texts_for_reward_dataset = LMFlowDataset.create_from_dict({ - "type": "text_only", - "instances": [ - { "text": texts_for_rewards[i] } for i in range(len(texts_for_rewards)) - ], - }) + world_size = int(os.getenv("WORLD_SIZE", "1")) + all_process_data = [{}] * world_size + dist.all_gather_object(all_process_data, data) - reward_dataset = reward_model.inference(texts_for_reward_dataset) - rewards = [ sample["value"] for sample in reward_dataset.to_dict()["instances"] ] - reward_eva.append(rewards[0]) - - ################################ - # we impose some post-detection and discard the samples with certain criteria. 
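[Editorial note on the hunk above] The local collection strategy in _get_batch_dataset_local is best-of-K sampling with reward filtering: draw K candidate responses per prompt, score prompt+response with the reward model, mask out candidates that fail the post-detection check, and keep the highest-scoring survivor (or drop the prompt entirely if all candidates are masked). A minimal sketch of that selection step, assuming generic generate_k, score, and discard callables rather than the LMFlow model classes:

import numpy as np

NEG_INF = float("-inf")

def select_best_of_k(prompt, generate_k, score, discard, k=8):
    """Return (best_response, best_reward), or None if every candidate is discarded."""
    candidates = generate_k(prompt, k)                      # K candidate continuations for one prompt
    rewards = [score(prompt + c) for c in candidates]       # reward model scores prompt + response
    rewards = [NEG_INF if discard(c) else r                 # post-detection filter masks bad samples
               for c, r in zip(candidates, rewards)]
    best = int(np.argmax(rewards))
    if rewards[best] == NEG_INF:                            # all candidates failed the filter
        return None
    return candidates[best], rewards[best]

# Toy usage with stand-in callables.
picked = select_best_of_k(
    "###Human: say hi",
    generate_k=lambda p, k: [f" hi there {i}" for i in range(k)],
    score=lambda text: float(len(text)),
    discard=lambda text: "#" in text or len(text) < 2,
    k=4,
)
print(picked)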
- for kk in range(K): - if self._discard_sample(generated_texts[kk]): - rewards[kk] = -self.INF - ################################ - - idx_to_record = np.argmax(rewards) - all_outputs.append(generated_texts[0]) - - # if we discard all the samples, we do not record the sample - if rewards[idx_to_record] != -self.INF: - responses.append(generated_texts[idx_to_record]) - reward_train.append(rewards[idx_to_record]) - record_querys.append(query) - input_texts = [] + all_process_eval_reward = [{}] * world_size + dist.all_gather_object(all_process_eval_reward, reward_eva) + all_process_train_set_reward = [{}] * world_size + dist.all_gather_object(all_process_train_set_reward, reward_train) + gathered_data = [] + gathered_reward = [] + gathered_train_reward = [] - data = [] - for j in range(len(reward_train)): - sample = {} - sample["input"] = record_querys[j] - sample["output"] = [responses[j]] - data.append(sample) - - - world_size = int(os.getenv("WORLD_SIZE", "1")) - all_process_data =[{}] * world_size - dist.all_gather_object(all_process_data, data) - - all_process_eval_reward =[{}] * world_size - dist.all_gather_object(all_process_eval_reward, reward_eva) - all_process_train_set_reward =[{}] * world_size - dist.all_gather_object(all_process_train_set_reward, reward_train) - - - gathered_data = [] - gathered_reward = [] - gathered_train_reward = [] - - for i in range(world_size): - gathered_data.extend(all_process_data[i]) - gathered_reward.extend(all_process_eval_reward[i]) - gathered_train_reward.extend(all_process_train_set_reward[i]) - - if training_args.local_rank == 0 and output_reward_path is not None: - with open(output_reward_path, mode='a') as fout: - fout.write('mean reward: ' + str(np.mean(gathered_reward)) + 'mean reward in training set: ' + str(np.mean(gathered_train_reward))) - fout.write("\n") - logger.info([np.mean(gathered_reward), np.mean(gathered_train_reward)]) - - - self.reward_seq.append(np.mean(gathered_reward)) - self.train_reawrd.append(np.mean(reward_train)) - import matplotlib.pyplot as plt - if training_args.local_rank == 0: - plt.plot(self.reward_seq, marker="o") - plt.plot(self.train_reawrd, marker="*") - plt.legend(["Model reward", "Reward of SFT Set"]) - plt.savefig(self.store_dir + '/training_reward.png') - plt.close() - - - prompt_structure = "{definition}{input}{output}" - tmp_output_dataset = { - "text": [ prompt_structure.format( - definition="", input=sample["input"], output=sample["output"][0] - ) for sample in gathered_data - ] - } + for i in range(world_size): + gathered_data.extend(all_process_data[i]) + gathered_reward.extend(all_process_eval_reward[i]) + gathered_train_reward.extend(all_process_train_set_reward[i]) + + if training_args.local_rank == 0 and output_reward_path is not None: + with open(output_reward_path, mode="a") as fout: + fout.write( + "mean reward: " + + str(np.mean(gathered_reward)) + + "mean reward in training set: " + + str(np.mean(gathered_train_reward)) + ) + fout.write("\n") + logger.info([np.mean(gathered_reward), np.mean(gathered_train_reward)]) + + self.reward_seq.append(np.mean(gathered_reward)) + self.train_reawrd.append(np.mean(reward_train)) + import matplotlib.pyplot as plt + + if training_args.local_rank == 0: + plt.plot(self.reward_seq, marker="o") + plt.plot(self.train_reawrd, marker="*") + plt.legend(["Model reward", "Reward of SFT Set"]) + plt.savefig(self.store_dir + "/training_reward.png") + plt.close() - # We store the training set for monitoring the RAFT training - all_texts = tmp_output_dataset['text'] - 
output_eval_dataset = {} - output_eval_dataset['type'] = 'text_only' - output_eval_dataset['instances'] = [{'text': i_text} for i_text in all_texts] - import json - if local_rank == 0: - with open(self.store_dir + "/train_set_" + str(iter_id) + ".json", 'w', encoding='utf8') as f: - json.dump(output_eval_dataset, f, ensure_ascii=False) + prompt_structure = "{definition}{input}{output}" + tmp_output_dataset = { + "text": [ + prompt_structure.format(definition="", input=sample["input"], output=sample["output"][0]) + for sample in gathered_data + ] + } - - # We need to make sure that the order of the samples are the same for each agent - all_process_list = [{}] * world_size - data_to_send = [tmp_output_dataset, local_rank] - dist.all_gather_object(all_process_list, data_to_send) - for i in range(world_size): - if all_process_list[i][1] == 0: - output_dataset = all_process_list[i][0] - break + # We store the training set for monitoring the RAFT training + all_texts = tmp_output_dataset["text"] + output_eval_dataset = {} + output_eval_dataset["type"] = "text_only" + output_eval_dataset["instances"] = [{"text": i_text} for i_text in all_texts] + import json - logger.info(f"collected data of {len(output_dataset['text'])}") + if local_rank == 0: + with open(self.store_dir + "/train_set_" + str(iter_id) + ".json", "w", encoding="utf8") as f: + json.dump(output_eval_dataset, f, ensure_ascii=False) + # We need to make sure that the order of the samples are the same for each agent + all_process_list = [{}] * world_size + data_to_send = [tmp_output_dataset, local_rank] + dist.all_gather_object(all_process_list, data_to_send) + for i in range(world_size): + if all_process_list[i][1] == 0: + output_dataset = all_process_list[i][0] + break - return DatasetDict({ "train": Dataset.from_dict(output_dataset) }) + logger.info(f"collected data of {len(output_dataset['text'])}") + return DatasetDict({"train": Dataset.from_dict(output_dataset)}) def align(self, model, dataset, reward_model): """ @@ -576,7 +572,7 @@ def align(self, model, dataset, reward_model): "top_p": 1.0, "do_sample": True, "pad_token_id": tokenizer.eos_token_id, - "temperature":0.85, + "temperature": 0.85, } aligner_args = self.aligner_args @@ -585,7 +581,6 @@ def align(self, model, dataset, reward_model): data_args = self.data_args world_size = int(os.getenv("WORLD_SIZE", "1")) - set_seed(42 + training_args.local_rank) ITERATION = aligner_args.num_raft_iteration collection_strategy = aligner_args.collection_strategy @@ -593,9 +588,9 @@ def align(self, model, dataset, reward_model): if collection_strategy == "top": alpha = aligner_args.top_reward_percentage - M = int(sft_batch_size / world_size / alpha) + M = int(sft_batch_size / world_size / alpha) elif collection_strategy == "local": - K = int(1/aligner_args.top_reward_percentage) + K = int(1 / aligner_args.top_reward_percentage) M = int(sft_batch_size / world_size) else: raise NotImplementedError("We only support two data collection strategies") @@ -606,8 +601,8 @@ def align(self, model, dataset, reward_model): self.store_dir = aligner_args.output_dir self.reward_seq = [] self.train_reawrd = [] - - data_size = len(dataset['input']) + + data_size = len(dataset["input"]) lr = training_args.learning_rate random_idxs = np.arange(data_size) np.random.shuffle(random_idxs) @@ -616,9 +611,9 @@ def align(self, model, dataset, reward_model): raft_trainer.train(resume_from_checkpoint=False, is_first_time=True) for iteration in range(ITERATION): - set_seed(666 + training_args.local_rank + world_size * 
(iteration+1)) + set_seed(666 + training_args.local_rank + world_size * (iteration + 1)) - end_idx = np.min([data_size, (iteration+1) * M]) + end_idx = np.min([data_size, (iteration + 1) * M]) batch_input = dataset.select(random_idxs[iteration * M : end_idx]) model.gradient_checkpointing_disable() model.config.use_cache = True @@ -658,7 +653,7 @@ def align(self, model, dataset, reward_model): ) end_time = time.time() logger.info("It takes %.2f s to inference one stage", end_time - start_time) - + raft_trainer.train_dataset = self._load_dataset( selected_dataset, raft_trainer.tmp_model, @@ -673,21 +668,20 @@ def align(self, model, dataset, reward_model): model.gradient_checkpointing_enable() model.config.use_cache = False - train_result = raft_trainer.train(resume_from_checkpoint=False) end_time = time.time() logger.info("It takes %.2f s to train one stage", end_time - start_time) - if (iteration+1) * M > data_size: + if (iteration + 1) * M > data_size: logger.info("One epoch is completed.") break - ''' + """ if training_args.local_rank == 0 and iteration % 2 == 0: wrapped_model.save(aligner_args.output_dir + "/" + "model" + str(iteration)) print(iteration, "I save a model with", self.reward_seq[-1]) - ''' + """ if aligner_args.output_dir is not None: wrapped_model.save(aligner_args.output_dir) - return wrapped_model + return wrapped_model diff --git a/src/lmflow/pipeline/rm_inferencer.py b/src/lmflow/pipeline/rm_inferencer.py index 890e4ee20..bddac6241 100644 --- a/src/lmflow/pipeline/rm_inferencer.py +++ b/src/lmflow/pipeline/rm_inferencer.py @@ -1,42 +1,32 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import copy -import os -import torch -import wandb -import deepspeed -import sys -import numpy as np -import datetime -import json -import time import logging -from typing import Dict, List, Union, Tuple, Any +import os +from typing import Any, Union -from accelerate import Accelerator +import numpy as np import torch +import torch.distributed as dist +from accelerate import Accelerator from tqdm import tqdm -from transformers import AutoConfig from transformers.modeling_outputs import SequenceClassifierOutputWithPast -import torch.distributed as dist -import torch.nn.functional as F from lmflow.args import ( DatasetArguments, - ModelArguments, InferencerArguments, + ModelArguments, ) from lmflow.datasets.dataset import Dataset from lmflow.models.hf_text_regression_model import HFTextRegressionModel from lmflow.pipeline.base_pipeline import BasePipeline from lmflow.utils.data_utils import ( - set_random_seed, - batchlize, RewardModelInferenceResultWithInput, + batchlize, + set_random_seed, ) -from lmflow.datasets.dataset import KEY_SCORE -from lmflow.utils.versioning import is_ray_available +from lmflow.utils.envs import is_accelerate_env +from lmflow.utils.versioning import is_deepspeed_available, is_ray_available if is_ray_available(): import ray @@ -63,10 +53,11 @@ class RewardModelInferencer(BasePipeline): inferencer_args : InferencerArguments object. Contains the arguments required to perform inference. 
""" + def __init__( - self, - model_args: ModelArguments, - data_args: DatasetArguments, + self, + model_args: ModelArguments, + data_args: DatasetArguments, inferencer_args: InferencerArguments, **kwargs, ): @@ -78,23 +69,23 @@ def __init__( self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) - if inferencer_args.device == "gpu": + if inferencer_args.device == "gpu": # FIXME: a bit weird here torch.cuda.set_device(self.local_rank) # NOTE: cpu-only machine will have error - deepspeed.init_distributed() + if not is_accelerate_env() and is_deepspeed_available(): + import deepspeed + + deepspeed.init_distributed() else: - dist.init_process_group( - "gloo", rank=self.local_rank, world_size=self.world_size - ) + dist.init_process_group("gloo", rank=self.local_rank, world_size=self.world_size) - if inferencer_args.use_accelerator: - self.accelerator: Accelerator = kwargs.get('accelerator', Accelerator()) - + if is_accelerate_env(): + self.accelerator: Accelerator = kwargs.get("accelerator", Accelerator()) def inference( self, model: HFTextRegressionModel, dataset: Dataset, - transform_dataset_in_place: bool=True, + transform_dataset_in_place: bool = True, use_vllm: bool = False, enable_distributed_inference: bool = False, **kwargs, @@ -102,21 +93,21 @@ def inference( if use_vllm: logger.warning("VLLM doesn't support reward model inference, using normal inference instead.") use_vllm = False - + assert isinstance(model, HFTextRegressionModel), "model should be HFTextRegressionModel" if not transform_dataset_in_place: dataset = copy.deepcopy(dataset) - + model_input = model.prepare_inputs_for_inference( dataset=dataset, apply_chat_template=True, enable_distributed_inference=enable_distributed_inference, - use_vllm=use_vllm + use_vllm=use_vllm, ) - + if use_vllm: inference_result = self.__vllm_inference( - model=model, + model=model, model_input=model_input, enable_distributed_inference=enable_distributed_inference, ) @@ -127,117 +118,103 @@ def inference( enable_distributed_inference=enable_distributed_inference, **kwargs, ) - + if enable_distributed_inference: output_dataset = model.postprocess_distributed_inference_outputs( dataset=dataset, inference_result=inference_result, ) else: - output_dataset = model.postprocess_inference_outputs( - dataset=dataset, - scores=inference_result - ) - + output_dataset = model.postprocess_inference_outputs(dataset=dataset, scores=inference_result) + return output_dataset - - + def _inference( self, model: HFTextRegressionModel, - model_input: Union[Dataset, 'ray.data.Dataset'], + model_input: Union[Dataset, "ray.data.Dataset"], enable_distributed_inference: bool = False, **kwargs, ): if enable_distributed_inference: if not is_ray_available(): raise ImportError('Ray is not installed. 
Please install via `pip install -e ".[ray]"`.') - + inference_res = self.__distributed_inference( - model=model, - model_input=model_input, + model=model, + model_input=model_input, num_instances=kwargs.get("distributed_inference_num_instances", 1), batch_size=kwargs.get("inference_batch_size", 1), ) else: inference_res = self.__inference( - model=model, + model=model, model_input=model_input, ) - - return inference_res + return inference_res def __inference( self, model: HFTextRegressionModel, model_input: Dataset, - ) -> Union[List[float], List[List[float]]]: + ) -> Union[list[float], list[list[float]]]: if model_input.get_type() in ["text_to_textlist"]: model_input_ids, num_outputs = self.flatten_list(model_input.get_backend_dataset()["input_ids"]) else: model_input_ids = model_input.get_backend_dataset()["input_ids"] - + dataloader = batchlize( examples=model_input_ids, batch_size=self.inferencer_args.inference_batch_size, - random_shuffle=False, # DO NOT shuffle when inference + random_shuffle=False, # DO NOT shuffle when inference ) num_batches = len(dataloader) final_output = [] - + for batch_index, batched_input_ids in tqdm( - iterable=enumerate(dataloader), - total=num_batches, - desc="Inference", - unit="batch" + iterable=enumerate(dataloader), total=num_batches, desc="Inference", unit="batch" ): # len(batch) = batch_size, and batch element is dataset sample model_input_tensor = torch.LongTensor(batched_input_ids).to("cpu" if model.device == "cpu" else "cuda") - if self.inferencer_args.use_accelerator: + if is_accelerate_env(): with self.accelerator.autocast(): batch_output = model.inference( - inputs=model_input_tensor, + inputs=model_input_tensor, use_vllm=False, ) else: batch_output = model.inference( - inputs=model_input_tensor, + inputs=model_input_tensor, use_vllm=False, ) - + batch_output = self.__post_process_model_output(batch_output) final_output.extend(batch_output) - + if model_input.get_type() in ["text_to_textlist"]: final_output = self.compress_list(final_output, num_outputs) - + return final_output - - + def __distributed_inference( self, model: HFTextRegressionModel, - model_input: 'ray.data.Dataset', + model_input: "ray.data.Dataset", num_instances: int, batch_size: int, - ) -> List[RewardModelInferenceResultWithInput]: + ) -> list[RewardModelInferenceResultWithInput]: def scheduling_strategy_fn(): # One bundle per tensor parallel worker pg = ray.util.placement_group( - [{ - "GPU": 1, - "CPU": 1 - }] * self.inferencer_args.tensor_parallel_size, + [{"GPU": 1, "CPU": 1}] * self.inferencer_args.tensor_parallel_size, strategy="STRICT_PACK", ) return dict( - scheduling_strategy=PlacementGroupSchedulingStrategy( - pg, placement_group_capture_child_tasks=True - ) + scheduling_strategy=PlacementGroupSchedulingStrategy(pg, placement_group_capture_child_tasks=True) ) - - resources_kwarg: Dict[str, Any] = {} + + resources_kwarg: dict[str, Any] = {} if self.inferencer_args.tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 @@ -247,113 +224,108 @@ def scheduling_strategy_fn(): # each instance. 
resources_kwarg["num_gpus"] = 0 resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn - + ## predictor class DistributedPredictor: def __init__( - self, + self, model_args: ModelArguments, ): self.model = HFTextRegressionModel( - model_args=model_args, - tune_strategy='none', - use_accelerator=True + model_args=model_args, + do_train=False, ) self.model.activate_model_for_inference(use_vllm=False) - - def __call__(self, batch: Dict[str, np.ndarray]): - """batch: Dict[str, np.ndarray] + + def __call__(self, batch: dict[str, np.ndarray]): + """batch: dict[str, np.ndarray] Example (batch size=2): {'input': array(['...','...'], dtype=object), - 'output': array([array(["...", "..."], dtype=object), array(['...','...'], dtype=object)], dtype=object), + 'output': array([array(["...", "..."], dtype=object), + array(["...", "..."], dtype=object),], + dtype=object), 'input_ids': array([[[128000, 128006, 882, ..., 128256, 128256, 128256], [128000, 128006, 882, ..., 128256, 128256, 128256]], [[128000, 128006, 882, ..., 128256, 128256, 128256], [128000, 128006, 882, ..., 128256, 128256, 128256]]])} """ - # The batch is managed by ray and the actual batch size may smaller than - # inference_batch_size in config, since there may be some remainders. + # The batch is managed by ray and the actual batch size may smaller than + # inference_batch_size in config, since there may be some remainders. # For example, 10 examples with 2 inference instances and inference_batch_size=4, - # there will be only 2 examples for instance 0 to run and then the + # there will be only 2 examples for instance 0 to run and then the # actual batch size changes. - actual_batch_size = len(batch['input']) - input_tensor = torch.LongTensor([ - [list(arr) for arr in batch['input_ids'][batch_idx]] - for batch_idx in range(actual_batch_size) - ]).flatten(start_dim=0, end_dim=1).to("cuda") + actual_batch_size = len(batch["input"]) + input_tensor = ( + torch.LongTensor( + [[list(arr) for arr in batch["input_ids"][batch_idx]] for batch_idx in range(actual_batch_size)] + ) + .flatten(start_dim=0, end_dim=1) + .to("cuda") + ) batched_inference_res = self.model.inference(input_tensor).logits - batched_inference_res = batched_inference_res.to("cpu").reshape(actual_batch_size, -1, 1).squeeze(dim=-1).tolist() + batched_inference_res = ( + batched_inference_res.to("cpu").reshape(actual_batch_size, -1, 1).squeeze(dim=-1).tolist() + ) # [bs, num_output_sequences] batched_final_res = { - "input": batch['input'].tolist(), + "input": batch["input"].tolist(), "output": [ [ {"score": batched_inference_res[j][i], "text": batch["output"][j][i]} - for i in range(len(batch['output'][j])) - ] + for i in range(len(batch["output"][j])) + ] for j in range(actual_batch_size) ], - } # do this since we're writing to a pandas dataframe + } # do this since we're writing to a pandas dataframe return batched_final_res # inference model_input_mapping = model_input.map_batches( DistributedPredictor, - concurrency=num_instances, # Set the concurrency to the number of LLM instances. + concurrency=num_instances, # Set the concurrency to the number of LLM instances. 
batch_size=batch_size, fn_constructor_kwargs={ "model_args": model.model_args, }, **resources_kwarg, ) - - df_model_output = model_input_mapping.to_pandas() # the actual forwards are executed here + + df_model_output = model_input_mapping.to_pandas() # the actual forwards are executed here logger.info(f"Distributed reward model inference result preview:\n{df_model_output.head(10)}") - - model_output = [ - {"input": row["input"], "output": row["output"]} for _, row in df_model_output[:].iterrows() - ] - + + model_output = [{"input": row["input"], "output": row["output"]} for _, row in df_model_output[:].iterrows()] + return model_output - - + def __vllm_inference( self, model: HFTextRegressionModel, - model_input: List[str], + model_input: list[str], enable_distributed_inference: bool = False, - ) -> List[float]: + ) -> list[float]: raise NotImplementedError("VLLM inference for reward model is not implemented yet.") - - + def __post_process_model_output( self, model_output: SequenceClassifierOutputWithPast, - ) -> List[float]: + ) -> list[float]: final_output = model_output.logits.to("cpu").reshape(-1).tolist() - + return final_output - - - def flatten_list( - self, - list_of_list: List[List] - ) -> Tuple[List, List[int]]: + + def flatten_list(self, list_of_list: list[list]) -> tuple[list, list[int]]: sublist_lengths = [len(sublist) for sublist in list_of_list] flattened_list = [item for sublist in list_of_list for item in sublist] return flattened_list, sublist_lengths - - - def compress_list( - self, - list_to_compress: List, - sublist_lengths: List[int] - ) -> List[List]: - assert sum(sublist_lengths) == len(list_to_compress), "Sum of sublist lengths should be equal to length of list to compress." + + def compress_list(self, list_to_compress: list, sublist_lengths: list[int]) -> list[list]: + assert sum(sublist_lengths) == len(list_to_compress), ( + "Sum of sublist lengths should be equal to length of list to compress." + ) compressed_list = [] start_index = 0 for length in sublist_lengths: - sublist = list_to_compress[start_index: start_index + length] + sublist = list_to_compress[start_index : start_index + length] compressed_list.append(sublist) start_index += length return compressed_list diff --git a/src/lmflow/pipeline/rm_tuner.py b/src/lmflow/pipeline/rm_tuner.py index 06b2094f1..1d131be61 100644 --- a/src/lmflow/pipeline/rm_tuner.py +++ b/src/lmflow/pipeline/rm_tuner.py @@ -1,24 +1,14 @@ -import sys import logging -from typing import Optional from copy import deepcopy import numpy as np -import datasets -import transformers -from transformers import set_seed -from transformers.utils import send_example_telemetry -from transformers.trainer_callback import ( - TrainerCallback -) +from transformers.trainer_callback import TrainerCallback from lmflow.datasets import Dataset from lmflow.models.hf_text_regression_model import HFTextRegressionModel from lmflow.pipeline.finetuner import Finetuner -from lmflow.pipeline.utils.rm_trainer import compute_metrics, RewardTrainer, PeftRewardTrainer -from lmflow.pipeline.utils.peft_trainer import PeftSavingCallback from lmflow.pipeline.utils.rm_dataprocessor import RewardDataCollatorWithPadding - +from lmflow.pipeline.utils.rm_trainer import RewardTrainer, compute_metrics logger = logging.getLogger(__name__) @@ -43,29 +33,17 @@ class RewardModelTuner(Finetuner): kwargs : Optional. Keyword arguments. 
""" - def __init__( - self, - model_args, - data_args, - finetuner_args, - *args, - **kwargs - ): + + def __init__(self, model_args, data_args, finetuner_args, *args, **kwargs): super().__init__(model_args, data_args, finetuner_args, *args, **kwargs) - - + def tune( - self, - model: HFTextRegressionModel, - dataset, - transform_dataset_in_place=True, - data_collator=None, - **kwargs + self, model: HFTextRegressionModel, dataset, transform_dataset_in_place=True, data_collator=None, **kwargs ): # 0. basic init if not transform_dataset_in_place: dataset = deepcopy(dataset) - + # 1. prepare dataset with self.finetuner_args.main_process_first(desc="dataset map tokenization"): tokenized_dataset = model.tokenize(dataset) @@ -82,7 +60,7 @@ def tune( if self.finetuner_args.do_train and self.data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), self.data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) - + if self.finetuner_args.do_eval: eval_dataset_args = deepcopy(self.data_args) eval_dataset_args.dataset_path = self.finetuner_args.eval_dataset_path @@ -98,22 +76,18 @@ def tune( ) eval_dataset = lm_dataset.get_backend_dataset() logger.info(f"Number of eval samples: {len(eval_dataset)}") - + if data_collator is None: data_collator = RewardDataCollatorWithPadding( - tokenizer=model.get_tokenizer(), - max_length=self.model_args.model_max_length + tokenizer=model.get_tokenizer(), max_length=self.model_args.model_max_length ) - + # 2. prepare trainer - if self.model_args.use_lora: - RewardModelingTrainer = PeftRewardTrainer - trainer_callbacks = [PeftSavingCallback] - else: - RewardModelingTrainer = RewardTrainer - trainer_callbacks = [] - + RewardModelingTrainer = RewardTrainer + trainer_callbacks = [] + if self.finetuner_args.use_lisa: + class DynamicLayerActivationCallback(TrainerCallback): def __init__(self, n_layers, interval_steps, model, **kwargs): super().__init__() @@ -123,24 +97,26 @@ def __init__(self, n_layers, interval_steps, model, **kwargs): # Determine the way to access layers based on the model type class_to_layers_map = { - 'LlamaForCausalLM': 'model.model.layers', - 'Qwen2ForCausalLM': 'model.model.layers', - 'MistralForCausalLM': 'model.model.layers', - 'MixtralForCausalLM': 'model.model.layers', - 'GemmaForCausalLM': 'model.model.layers', - 'GPT2LMHeadModel': 'model.transformer.h', + "LlamaForCausalLM": "model.model.layers", + "Qwen2ForCausalLM": "model.model.layers", + "MistralForCausalLM": "model.model.layers", + "MixtralForCausalLM": "model.model.layers", + "GemmaForCausalLM": "model.model.layers", + "GPT2LMHeadModel": "model.transformer.h", } model_class_name = self.model.__class__.__name__ if model_class_name in class_to_layers_map: self.layers_attribute = class_to_layers_map[model_class_name] else: self.layers_attribute = kwargs.get("lisa_layers_attribute") - self.total_layers = len(eval('self.' + self.layers_attribute)) # Dynamically execute to get the number of layers + self.total_layers = len( + eval("self." + self.layers_attribute) + ) # Dynamically execute to get the number of layers self.active_layers_indices = [] def freeze_all_layers(self): - layers = eval('self.' + self.layers_attribute) # Dynamically execute to get layers + layers = eval("self." 
+ self.layers_attribute) # Dynamically execute to get layers for layer in layers: for param in layer.parameters(): param.requires_grad = False @@ -155,8 +131,10 @@ def switch_active_layers(self): self.freeze_all_layers() # Randomly select n_layers to activate - layers = eval('self.' + self.layers_attribute) # Re-fetch layer references - self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False) + layers = eval("self." + self.layers_attribute) # Re-fetch layer references + self.active_layers_indices = np.random.choice( + range(self.total_layers), self.n_layers, replace=False + ) print(f"Activating layers at indices: {self.active_layers_indices} for the next steps.", flush=True) # Enable gradients only for the selected layers @@ -166,14 +144,14 @@ def switch_active_layers(self): # Instantiate the callback dynamic_layer_activation_callback = DynamicLayerActivationCallback( - n_layers=self.finetuner_args.lisa_activated_layers, # Number of layers to activate + n_layers=self.finetuner_args.lisa_activated_layers, # Number of layers to activate interval_steps=self.finetuner_args.lisa_interval_steps, # Step interval to update active layers model=model.get_backend_model(), - lisa_layers_attribute=self.finetuner_args.lisa_layers_attribute + lisa_layers_attribute=self.finetuner_args.lisa_layers_attribute, ) trainer_callbacks.append(dynamic_layer_activation_callback) - + trainer = RewardModelingTrainer( model=model.get_backend_model(), args=self.finetuner_args, @@ -182,9 +160,9 @@ def switch_active_layers(self): tokenizer=model.get_tokenizer(), data_collator=data_collator, compute_metrics=compute_metrics if self.finetuner_args.do_eval else None, - callbacks=trainer_callbacks + callbacks=trainer_callbacks, ) - + # 3. training if self.finetuner_args.do_train: checkpoint = None @@ -193,14 +171,15 @@ def switch_active_layers(self): checkpoint = self.finetuner_args.resume_from_checkpoint elif last_checkpoint is not None: checkpoint = last_checkpoint - + if self.finetuner_args.gradient_checkpointing: if model.get_backend_model().config.use_cache: logger.warning( - "Backend model config `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + "Backend model config `use_cache=True` is incompatible with gradient checkpointing. " + "Setting `use_cache=False`." 
) model.get_backend_model().config.use_cache = False - + train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload @@ -230,4 +209,4 @@ def switch_active_layers(self): else: trainer.create_model_card(**kwargs) - return model \ No newline at end of file + return model diff --git a/src/lmflow/pipeline/utils/dpov2_dataprocessor.py b/src/lmflow/pipeline/utils/dpov2_dataprocessor.py index 4c74c8306..912554bf8 100644 --- a/src/lmflow/pipeline/utils/dpov2_dataprocessor.py +++ b/src/lmflow/pipeline/utils/dpov2_dataprocessor.py @@ -1,6 +1,6 @@ -from dataclasses import dataclass import logging -from typing import Optional, Union, Dict, List, Any +from dataclasses import dataclass +from typing import Any, Optional, Union import torch from torch.nn.utils.rnn import pad_sequence @@ -9,7 +9,6 @@ PreTrainedTokenizerBase, ) - logger = logging.getLogger(__name__) @@ -27,13 +26,12 @@ class PreferenceDataCollatorWithPadding: max_target_length: Optional[int] = None mask_prompt: Optional[bool] = False - def tokenize_batch_element( self, prompt: str, chosen: str, rejected: str, - ) -> Dict: + ) -> dict[str, Any]: """Tokenize a single batch element. At this stage, we don't convert to PyTorch tensors yet; we just handle the truncation @@ -48,7 +46,7 @@ def tokenize_batch_element( if self.is_encoder_decoder: raise NotImplementedError - + chosen_tokens = self.tokenizer(chosen, add_special_tokens=False) rejected_tokens = self.tokenizer(rejected, add_special_tokens=False) prompt_tokens = self.tokenizer(prompt, add_special_tokens=False) @@ -100,9 +98,7 @@ def tokenize_batch_element( # if that's still too long, truncate the response if len(prompt_tokens["input_ids"]) + longer_response_length > self.max_length: chosen_tokens = {k: v[: self.max_length - self.max_prompt_length] for k, v in chosen_tokens.items()} - rejected_tokens = { - k: v[: self.max_length - self.max_prompt_length] for k, v in rejected_tokens.items() - } + rejected_tokens = {k: v[: self.max_length - self.max_prompt_length] for k, v in rejected_tokens.items()} # Create labels chosen_sequence_tokens = {k: prompt_tokens[k] + chosen_tokens[k] for k in chosen_tokens} @@ -126,8 +122,6 @@ def tokenize_batch_element( continue batch[f"{k}_{type_key}"] = tokens - - batch["prompt"] = prompt batch["chosen"] = prompt + chosen batch["rejected"] = prompt + rejected @@ -136,7 +130,6 @@ def tokenize_batch_element( return batch - def collate(self, batch): # first, pad everything to the same length padded_batch = {} @@ -178,8 +171,7 @@ def collate(self, batch): return padded_batch - - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]: tokenized_batch = [] for feature in features: @@ -192,4 +184,4 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: tokenized_batch.append(batch_element) # return collated batch - return self.collate(tokenized_batch) \ No newline at end of file + return self.collate(tokenized_batch) diff --git a/src/lmflow/pipeline/utils/dpov2_trainer.py b/src/lmflow/pipeline/utils/dpov2_trainer.py index 0cb50711c..5b6e9bc90 100644 --- a/src/lmflow/pipeline/utils/dpov2_trainer.py +++ b/src/lmflow/pipeline/utils/dpov2_trainer.py @@ -1,16 +1,15 @@ import logging -from typing import Optional, Union, Dict, List, Any, Tuple, Callable, Literal +from typing import Callable, Literal, Optional, Union -from datasets import Dataset import torch import torch.nn as nn import torch.nn.functional 
as F +from datasets import Dataset from transformers import ( - PreTrainedModel, - PreTrainedTokenizerBase, - DataCollator, - TrainingArguments, - TrainerCallback + DataCollator, + PreTrainedModel, + PreTrainedTokenizerBase, + TrainingArguments, ) from transformers.trainer_callback import TrainerCallback from transformers.trainer_utils import EvalLoopOutput @@ -30,7 +29,7 @@ class DPOv2Trainer(DPOTrainer): def __init__( self, - model: Union[PreTrainedModel, nn.Module] = None, + model: Union[PreTrainedModel, nn.Module], ref_model: Optional[Union[PreTrainedModel, nn.Module]] = None, beta: float = 0.1, loss_type: Literal["sigmoid", "hinge", "cross_entropy", "kl", "rev_kl", "raft"] = "rev_kl", @@ -40,28 +39,24 @@ def __init__( padding_value: int = 0, truncation_mode: str = "keep_end", train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = ( - None, - None, - ), + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, max_length: Optional[int] = None, max_prompt_length: Optional[int] = None, max_target_length: Optional[int] = None, - peft_config: Optional[Dict] = None, + peft_config: Optional[dict] = None, is_encoder_decoder: Optional[bool] = None, disable_dropout: bool = True, generate_during_eval: bool = False, - compute_metrics: Optional[Callable[[EvalLoopOutput], Dict]] = None, + compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None, mask_prompt: Optional[bool] = False, len_penalty: float = 0, preprocessing_num_workers: int = 1, ): - if data_collator is None: data_collator = PreferenceDataCollatorWithPadding( tokenizer, @@ -113,21 +108,34 @@ def dpo_loss( reference_free: bool = False, margin: Optional[torch.FloatTensor] = None, len_penalty: float = 0, - ) -> Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: + ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: """Compute the DPO loss for a batch of policy and reference model log probabilities. Args: - policy_chosen_logps: Log probabilities of the policy model for the chosen responses. Shape: (batch_size,) - policy_rejected_logps: Log probabilities of the policy model for the rejected responses. Shape: (batch_size,) - reference_chosen_logps: Log probabilities of the reference model for the chosen responses. Shape: (batch_size,) - reference_rejected_logps: Log probabilities of the reference model for the rejected responses. Shape: (batch_size,) - beta: Temperature parameter for the DPO loss, typically something in the range of 0.1 to 0.5. We ignore the reference model as beta -> 0. - reference_free: If True, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal probability to all responses. + policy_chosen_logps: + Log probabilities of the policy model for the chosen responses. + Shape: (batch_size,) + policy_rejected_logps: + Log probabilities of the policy model for the rejected responses. 
+ Shape: (batch_size,) + reference_chosen_logps: + Log probabilities of the reference model for the chosen responses. + Shape: (batch_size,) + reference_rejected_logps: + Log probabilities of the reference model for the rejected responses. + Shape: (batch_size,) + beta: + Temperature parameter for the DPO loss, typically something in the + range of 0.1 to 0.5. We ignore the reference model as beta -> 0. + reference_free: + If True, we ignore the _provided_ reference model and implicitly use + a reference model that assigns equal probability to all responses. Returns: A tuple of three tensors: (losses, chosen_rewards, rejected_rewards). The losses tensor contains the DPO loss for each example in the batch. - The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and rejected responses, respectively. + The chosen_rewards and rejected_rewards tensors contain the rewards for the chosen and + rejected responses, respectively. """ pi_logratios = policy_chosen_logps - policy_rejected_logps ref_logratios = reference_chosen_logps - reference_rejected_logps + len_penalty @@ -148,7 +156,8 @@ def dpo_loss( losses = -policy_chosen_logps # F.logsigmoid(self.beta * logits) elif self.loss_type == "ipo": logits = pi_logratios - ref_logratios - # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, denoted by tau in the paper. + # eqn (17) of the paper where beta is the regularization parameter for the IPO loss, + # denoted by tau in the paper. losses = (logits - 1 / (2 * self.beta)) ** 2 elif self.loss_type == "kl": logits = pi_logratios - ref_logratios @@ -184,7 +193,7 @@ def dpo_loss( def get_batch_loss_metrics( self, model, - batch: Dict[str, Union[List, torch.LongTensor]], + batch: dict[str, Union[list, torch.LongTensor]], train_eval: Literal["train", "eval"] = "train", ): return self.get_batch_metrics(model, batch, train_eval) @@ -192,7 +201,7 @@ def get_batch_loss_metrics( def get_batch_metrics( self, model, - batch: Dict[str, Union[List, torch.LongTensor]], + batch: dict[str, Union[list, torch.LongTensor]], train_eval: Literal["train", "eval"] = "train", ): """Compute the DPO loss and other metrics for the given batch of inputs for train or test.""" diff --git a/src/lmflow/pipeline/utils/lisa_trainer.py b/src/lmflow/pipeline/utils/lisa_trainer.py new file mode 100644 index 000000000..a1e441b1d --- /dev/null +++ b/src/lmflow/pipeline/utils/lisa_trainer.py @@ -0,0 +1,66 @@ +from typing import Optional + +import numpy as np +from transformers import PreTrainedModel +from transformers.trainer_callback import TrainerCallback + + +class DynamicLayerActivationCallback(TrainerCallback): + def __init__( + self, + n_layers: int, + interval_steps: int, + model: PreTrainedModel, + lisa_layers_attribute: Optional[str] = None, + ): + super().__init__() + self.n_layers = n_layers + self.interval_steps = interval_steps + self.model = model + + # Determine the way to access layers based on the model type + class_to_layers_map = { + "LlamaForCausalLM": "model.model.layers", + "Qwen2ForCausalLM": "model.model.layers", + "MistralForCausalLM": "model.model.layers", + "MixtralForCausalLM": "model.model.layers", + "GemmaForCausalLM": "model.model.layers", + "GPT2LMHeadModel": "model.transformer.h", + "HymbaForCausalLM": "model.model.layers", + } + model_class_name = self.model.__class__.__name__ + if model_class_name in class_to_layers_map: + self.layers_attribute = class_to_layers_map[model_class_name] + else: + assert lisa_layers_attribute is not None, "Please 
provide the attribute to access the layers of the model." + self.layers_attribute = lisa_layers_attribute + self.total_layers = len( + eval("self." + self.layers_attribute) + ) # Dynamically execute to get the number of layers + + self.active_layers_indices = [] + + def freeze_all_layers(self): + layers = eval("self." + self.layers_attribute) # Dynamically execute to get layers + for layer in layers: + for param in layer.parameters(): + param.requires_grad = False + + def on_step_begin(self, args, state, control, **kwargs): + # Check if it's time to switch active layers, including at step 0 + if state.global_step % self.interval_steps == 0: + self.switch_active_layers() + + def switch_active_layers(self): + # First, disable gradients for all layers + self.freeze_all_layers() + + # Randomly select n_layers to activate + layers = eval("self." + self.layers_attribute) # Re-fetch layer references + self.active_layers_indices = np.random.choice(range(self.total_layers), self.n_layers, replace=False) + print(f"Activating layers at indices: {self.active_layers_indices} for the next steps.", flush=True) + + # Enable gradients only for the selected layers + for idx in self.active_layers_indices: + for param in layers[idx].parameters(): + param.requires_grad = True diff --git a/src/lmflow/pipeline/utils/memory_safe_dpov2_align.py b/src/lmflow/pipeline/utils/memory_safe_dpov2_align.py index c4c1bb6ab..b27234df1 100644 --- a/src/lmflow/pipeline/utils/memory_safe_dpov2_align.py +++ b/src/lmflow/pipeline/utils/memory_safe_dpov2_align.py @@ -1,57 +1,48 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. -import logging -import os -import sys import copy +import logging -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.pipeline.dpov2_aligner import DPOv2Aligner from lmflow.args import ( - ModelArguments, - DatasetArguments, + DatasetArguments, DPOv2AlignerArguments, + ModelArguments, ) -from lmflow.utils.common import remove_dataclass_attr_prefix, create_copied_dataclass - +from lmflow.datasets import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.pipeline.dpov2_aligner import DPOv2Aligner +from lmflow.utils.common import create_copied_dataclass, remove_dataclass_attr_prefix logger = logging.getLogger(__name__) ReferenceModelArguments: ModelArguments = create_copied_dataclass( - original_dataclass=ModelArguments, - field_prefix="reference_", - class_prefix="Reference" + original_dataclass=ModelArguments, field_prefix="reference_", class_prefix="Reference" ) def main(): # Parses arguments - parser = HfArgumentParser(( - ModelArguments, - ReferenceModelArguments, - DatasetArguments, - DPOv2AlignerArguments, - )) + parser = HfArgumentParser( + ( + ModelArguments, + ReferenceModelArguments, + DatasetArguments, + DPOv2AlignerArguments, + ) + ) target_model_args, ref_model_args, data_args, aligner_args = parser.parse_args_into_dataclasses() - + ref_model_args_dict = remove_dataclass_attr_prefix(ref_model_args, "reference_") ref_model_args = ModelArguments(**ref_model_args_dict) target_model = HFDecoderModel(target_model_args) ref_model = HFDecoderModel(ref_model_args) train_dataset = Dataset(data_args) - eval_dataset = copy.deepcopy(train_dataset.sample( - n=100, - seed=aligner_args.random_seed - )) - + eval_dataset = 
copy.deepcopy(train_dataset.sample(n=100, seed=aligner_args.random_seed)) + aligner = DPOv2Aligner( model_args=target_model_args, data_args=train_dataset.data_args, @@ -64,7 +55,7 @@ def main(): train_dataset=train_dataset, eval_dataset=eval_dataset, ) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py index d7859d9a1..db14ae96d 100644 --- a/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py +++ b/src/lmflow/pipeline/utils/memory_safe_vllm_inference.py @@ -1,31 +1,26 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. # Note that this is only a workaround, since vllm -# inference engine cannot release GPU memory properly by now. Please see this github +# inference engine cannot release GPU memory properly by now. Please see this github # [issue](https://github.com/vllm-project/vllm/issues/1908). import logging -import sys import os -from typing import Dict +import sys -from transformers import ( - HfArgumentParser -) +from transformers import HfArgumentParser -from lmflow.datasets import Dataset -from lmflow.models.auto_model import AutoModel -from lmflow.pipeline.vllm_inferencer import VLLMInferencer from lmflow.args import ( - ModelArguments, - DatasetArguments, AutoArguments, + DatasetArguments, + ModelArguments, ) +from lmflow.datasets import Dataset +from lmflow.models.auto_model import AutoModel +from lmflow.pipeline.vllm_inferencer import VLLMInferencer from lmflow.utils.constants import MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG - logger = logging.getLogger(__name__) @@ -34,11 +29,7 @@ def main(): pipeline_name = "vllm_inferencer" PipelineArguments = AutoArguments.get_pipeline_args_class(pipeline_name) - parser = HfArgumentParser(( - ModelArguments, - DatasetArguments, - PipelineArguments - )) + parser = HfArgumentParser((ModelArguments, DatasetArguments, PipelineArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
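For reference, the JSON-or-CLI dispatch in the hunk above follows the standard HfArgumentParser pattern. A minimal, self-contained sketch, with toy dataclasses standing in for the actual LMFlow argument classes, might look like this:

    import sys
    from dataclasses import dataclass

    from transformers import HfArgumentParser


    @dataclass
    class ToyModelArguments:  # hypothetical stand-in for ModelArguments
        model_name_or_path: str = "gpt2"


    @dataclass
    class ToyPipelineArguments:  # hypothetical stand-in for the inferencer arguments
        results_path: str = "results.json"


    parser = HfArgumentParser((ToyModelArguments, ToyPipelineArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single positional *.json argument supplies every dataclass field.
        model_args, pipeline_args = parser.parse_json_file(json_file=sys.argv[1])
    else:
        # Otherwise fall back to ordinary --flag command line arguments.
        model_args, pipeline_args = parser.parse_args_into_dataclasses()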
@@ -47,7 +38,7 @@ def main(): model_args, data_args, pipeline_args = parser.parse_args_into_dataclasses() dataset = Dataset(data_args) - model = AutoModel.get_model(model_args, tune_strategy='none') + model = AutoModel.get_model(model_args, do_train=False) inferencer = VLLMInferencer(model_args, data_args, pipeline_args) res = inferencer.inference( @@ -59,10 +50,11 @@ def main(): distributed_inference_num_instances=pipeline_args.distributed_inference_num_instances, inference_batch_size=pipeline_args.vllm_inference_batch_size, ) - + print(len(res)) + # use this as a flag, stdout will be captured by the pipeline print(MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG) - + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/lmflow/pipeline/utils/peft_trainer.py b/src/lmflow/pipeline/utils/peft_trainer.py deleted file mode 100644 index 2eecfd7dd..000000000 --- a/src/lmflow/pipeline/utils/peft_trainer.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 -"""Trainer for Peft models -""" - -from __future__ import absolute_import -from transformers import Trainer -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR -from transformers.trainer_callback import ( - TrainerCallback, - TrainerControl, - TrainerState, -) -from transformers.training_args import TrainingArguments -import os -import numpy as np - -class PeftTrainer(Trainer): - def _save_checkpoint(self, _, trial, metrics=None): - """ Don't save base model, optimizer etc. - but create checkpoint folder (needed for saving adapter) """ - checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" - - run_dir = self._get_output_dir(trial=trial) - output_dir = os.path.join(run_dir, checkpoint_folder) - - if metrics is not None and self.args.metric_for_best_model is not None: - metric_to_check = self.args.metric_for_best_model - if not metric_to_check.startswith("eval_"): - metric_to_check = f"eval_{metric_to_check}" - metric_value = metrics[metric_to_check] - - operator = np.greater if self.args.greater_is_better else np.less - if (self.state.best_metric is None or self.state.best_model_checkpoint is None - or operator(metric_value, self.state.best_metric)): - self.state.best_metric = metric_value - - self.state.best_model_checkpoint = output_dir - - os.makedirs(output_dir, exist_ok=True) - - if self.args.should_save: - self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) - -class PeftSavingCallback(TrainerCallback): - """ Correctly save PEFT model and not full model """ - def _save(self, model, folder): - if folder is None: - folder = "" - peft_model_path = os.path.join(folder, "adapter_model") - model.save_pretrained(peft_model_path) - - def on_train_end(self, args: TrainingArguments, state: TrainerState, - control: TrainerControl, **kwargs): - """ Save final best model adapter """ - self._save(kwargs['model'], state.best_model_checkpoint) - - def on_epoch_end(self, args: TrainingArguments, state: TrainerState, - control: TrainerControl, **kwargs): - """ Save intermediate model adapters in case of interrupted training """ - folder = os.path.join(args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}") - self._save(kwargs['model'], folder) - - def on_save( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ): - checkpoint_folder = os.path.join( - args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}" - ) - self._save(kwargs['model'], checkpoint_folder) - - peft_model_path = os.path.join(checkpoint_folder, 
"adapter_model") - kwargs["model"].save_pretrained(peft_model_path) - return control \ No newline at end of file diff --git a/src/lmflow/pipeline/utils/raft_trainer.py b/src/lmflow/pipeline/utils/raft_trainer.py index c3a5253f7..83686c11b 100644 --- a/src/lmflow/pipeline/utils/raft_trainer.py +++ b/src/lmflow/pipeline/utils/raft_trainer.py @@ -13,10 +13,7 @@ from collections.abc import Mapping from distutils.util import strtobool from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union - -from tqdm.auto import tqdm - +from typing import Any, Callable, Optional, Union # Integrations must be imported before ML frameworks: # isort: off @@ -39,13 +36,12 @@ ) except ImportError: from transformers.integrations import ( - get_reporting_integration_callbacks, - hp_params, - is_fairscale_available, + get_reporting_integration_callbacks, + hp_params, + is_fairscale_available, ) - from transformers.hyperparameter_search import default_hp_search_backend,ALL_HYPERPARAMETER_SEARCH_BACKENDS + from transformers.hyperparameter_search import default_hp_search_backend, ALL_HYPERPARAMETER_SEARCH_BACKENDS - # isort: on @@ -57,12 +53,9 @@ from torch import nn from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler - - from transformers.configuration_utils import PretrainedConfig from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled from transformers.dependency_versions_check import dep_version_check from transformers.modelcard import TrainingSummary from transformers.modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model @@ -136,7 +129,6 @@ can_return_loss, find_labels, get_full_repo_name, - is_accelerate_available, is_apex_available, is_datasets_available, is_in_notebook, @@ -150,6 +142,13 @@ ) from transformers.utils.generic import ContextManagers +from lmflow.utils.versioning import is_package_version_at_least + +if is_package_version_at_least("transformers", "4.46.0"): + from transformers.integrations.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled +else: + from transformers.deepspeed import deepspeed_init, is_deepspeed_zero3_enabled + _is_native_cpu_amp_available = is_torch_greater_or_equal_than_1_10 @@ -196,7 +195,6 @@ skip_first_batches = None - logger = logging.get_logger(__name__) @@ -234,7 +232,7 @@ class RaftTrainer: `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally sets the seed of the RNGs used. - eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]), *optional*): + eval_dataset (Union[`torch.utils.data.Dataset`, dict[str, `torch.utils.data.Dataset`]), *optional*): The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each dataset prepending the dictionary key to the metric name. 
@@ -248,14 +246,14 @@ class RaftTrainer: The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to be able to choose different architectures according to hyper parameters (such as layer count, sizes of inner layers, dropout probabilities etc). - compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): + compute_metrics (`Callable[[EvalPrediction], dict]`, *optional*): The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return a dictionary string to metric values. - callbacks (List of [`TrainerCallback`], *optional*): + callbacks (list of [`TrainerCallback`], *optional*): A list of callbacks to customize the training loop. Will add those to the list of default callbacks detailed in [here](callback). If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method. - optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple + optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): @@ -287,12 +285,12 @@ def __init__( args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, train_dataset: Optional[Dataset] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, - compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, - callbacks: Optional[List[TrainerCallback]] = None, - optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, ): ############ @@ -1047,10 +1045,10 @@ def create_optimizer(self): for module in opt_model.modules(): if isinstance(module, nn.Embedding): skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) - print(f"skipped {module}: {skipped/2**20}M params") + print(f"skipped {module}: {skipped / 2**20}M params") manager.register_module_override(module, "weight", {"optim_bits": 32}) logger.debug(f"bitsandbytes: will optimize {module} in fp32") - print(f"skipped: {skipped/2**20}M params") + print(f"skipped: {skipped / 2**20}M params") if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) @@ -1058,7 +1056,7 @@ def create_optimizer(self): return self.optimizer @staticmethod - def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> Tuple[Any, Any]: + def get_optimizer_cls_and_kwargs(args: TrainingArguments) -> tuple[Any, Any]: """ Returns the optimizer class and optimizer parameters based on the training arguments. 
Args: @@ -1179,7 +1177,7 @@ def num_examples(self, dataloader: DataLoader) -> int: except (NameError, AttributeError, TypeError): # no dataset or length, estimate by length of dataloader return len(dataloader) * self.args.per_device_train_batch_size - def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): + def _hp_search_setup(self, trial: Union["optuna.Trial", dict[str, Any]]): """HP search setup code""" self._trial = trial @@ -1220,7 +1218,7 @@ def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]): self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed) self.args.hf_deepspeed_config.trainer_config_process(self.args) - def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, Any]], step: int, metrics: Dict[str, float]): + def _report_to_hp_search(self, trial: Union["optuna.Trial", dict[str, Any]], step: int, metrics: dict[str, float]): if self.hp_search_backend is None or trial is None: return self.objective = self.compute_objective(metrics.copy()) @@ -1491,9 +1489,7 @@ def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}): xm.optimizer_step = patched_optimizer_step elif is_sagemaker_dp_enabled(): - model = nn.parallel.DistributedDataParallel( - model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))] - ) + model = nn.parallel.DistributedDataParallel(model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]) elif self.args.local_rank != -1: kwargs = {} if self.args.ddp_find_unused_parameters is not None: @@ -1521,9 +1517,9 @@ def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}): def train( self, resume_from_checkpoint: Optional[Union[str, bool]] = None, - trial: Union["optuna.Trial", Dict[str, Any]] = None, - ignore_keys_for_eval: Optional[List[str]] = None, - is_first_time = False, + trial: Union["optuna.Trial", dict[str, Any]] = None, + ignore_keys_for_eval: Optional[list[str]] = None, + is_first_time=False, **kwargs, ): """ @@ -1533,9 +1529,9 @@ def train( If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here. - trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + trial (`optuna.Trial` or `dict[str, Any]`, *optional*): The trial run or the hyperparameter dictionary for hyperparameter search. - ignore_keys_for_eval (`List[str]`, *optional*) + ignore_keys_for_eval (`list[str]`, *optional*) A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions for evaluation during the training. 
kwargs: @@ -1549,7 +1545,7 @@ def train( args = self.args - #self.is_in_train = True + # self.is_in_train = True # do_train is not a reliable argument, as it might not be set and .train() still called, so # the following is a workaround: @@ -1613,14 +1609,12 @@ def train( trial=trial, ignore_keys_for_eval=ignore_keys_for_eval, ) - - + def _one_train( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): - #print(self.lr_scheduler) - #print(dir(self.lr_scheduler)) - + # print(self.lr_scheduler) + # print(dir(self.lr_scheduler)) self.state = TrainerState() self.state.is_hyper_param_search = trial is not None @@ -1628,7 +1622,7 @@ def _one_train( self._train_batch_size = batch_size # Data loader and number of training steps train_dataloader = self.get_train_dataloader() - #print("AAAAAAA", len(train_dataloader)) + # print("AAAAAAA", len(train_dataloader)) total_train_batch_size = args.train_batch_size * args.gradient_accumulation_steps * args.world_size @@ -1663,7 +1657,7 @@ def _one_train( f" {args.max_steps}" ) ########### - #num_train_epochs = 5 + # num_train_epochs = 5 # Train! logger.info("***** Running training *****") @@ -1709,14 +1703,14 @@ def _one_train( # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses self._total_loss_scalar = 0.0 self._globalstep_last_logged = self.state.global_step - #model.zero_grad() + # model.zero_grad() self.tmp_model.zero_grad() self.control = self.callback_handler.on_train_begin(args, self.state, self.control) # Skip the first epochs_trained epochs to get the random state of the dataloader at the right point. if not args.ignore_data_skip: - #print("I skip!") called + # print("I skip!") called for epoch in range(epochs_trained): is_random_sampler = hasattr(train_dataloader, "sampler") and isinstance( train_dataloader.sampler, RandomSampler @@ -1732,10 +1726,10 @@ def _one_train( _ = list(train_dataloader.sampler) ############### - #num_train_epochs = 10 + # num_train_epochs = 10 self.is_in_train = True - #print("The number of epoches: ", num_train_epochs) - ############# + # print("The number of epoches: ", num_train_epochs) + ############# total_batched_samples = 0 for epoch in range(epochs_trained, num_train_epochs): if isinstance(train_dataloader, DataLoader) and isinstance(train_dataloader.sampler, DistributedSampler): @@ -1754,9 +1748,7 @@ def _one_train( self._past = None steps_in_epoch = ( - len(epoch_iterator) - if len_dataloader is not None - else args.max_steps * args.gradient_accumulation_steps + len(epoch_iterator) if len_dataloader is not None else args.max_steps * args.gradient_accumulation_steps ) self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) @@ -1771,7 +1763,7 @@ def _one_train( steps_trained_in_current_epoch = 0 rng_to_sync = True - #print("The number of one epoch: ", len(epoch_iterator)) + # print("The number of one epoch: ", len(epoch_iterator)) step = -1 for step, inputs in enumerate(epoch_iterator): total_batched_samples += 1 @@ -1802,8 +1794,8 @@ def _one_train( # Avoid unnecessary DDP synchronization since there will be no backward pass on this example. 
with self.tmp_model.no_sync(): tr_loss_step = self.training_step(self.tmp_model, inputs) - #with model.no_sync(): - #tr_loss_step = self.training_step(model, inputs) + # with model.no_sync(): + # tr_loss_step = self.training_step(model, inputs) else: tr_loss_step = self.training_step(self.tmp_model, inputs) @@ -1825,8 +1817,7 @@ def _one_train( if total_batched_samples % args.gradient_accumulation_steps == 0 or ( # last step in epoch but step is always smaller than gradient_accumulation_steps - steps_in_epoch <= args.gradient_accumulation_steps - and (step + 1) == steps_in_epoch + steps_in_epoch <= args.gradient_accumulation_steps and (step + 1) == steps_in_epoch ): # Gradient clipping if args.max_grad_norm is not None and args.max_grad_norm > 0 and not self.deepspeed: @@ -1959,10 +1950,10 @@ def _one_train( def _inner_training_loop( self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None ): - ''' + """ 0 This function serves to train one time 1 Update the self.train_dataset before calling this function - ''' + """ # 1 Get dataloader self._train_batch_size = batch_size # Data loader and number of training steps @@ -2026,7 +2017,7 @@ def _inner_training_loop( self.deepspeed = deepspeed_engine self.optimizer = optimizer self.lr_scheduler = lr_scheduler - #print("I just create a optimizer here!") # called + # print("I just create a optimizer here!") # called elif not delay_optimizer_creation: self.create_optimizer_and_scheduler(num_training_steps=max_steps) @@ -2037,11 +2028,10 @@ def _inner_training_loop( if args.gradient_checkpointing: self.model.gradient_checkpointing_enable() - #model = self._wrap_model(self.model_wrapped) + # model = self._wrap_model(self.model_wrapped) self.tmp_model = self._wrap_model(self.model_wrapped) - - #if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: + # if is_sagemaker_mp_enabled() and resume_from_checkpoint is not None: # self._load_from_checkpoint(resume_from_checkpoint, model) # for the rest of this function `model` is the outside model, whether it was wrapped or not @@ -2054,14 +2044,12 @@ def _inner_training_loop( return True # Check if saved optimizer or scheduler states exist - #self._load_optimizer_and_scheduler(resume_from_checkpoint) + # self._load_optimizer_and_scheduler(resume_from_checkpoint) # important: at this point: # self.model is the Transformers Model # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), etc. - - def _get_output_dir(self, trial): if self.hp_search_backend is not None and trial is not None: if self.hp_search_backend == HPSearchBackend.OPTUNA: @@ -2209,16 +2197,14 @@ def _issue_warnings_after_load(self, load_result): else: logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.") if len(load_result.unexpected_keys) != 0: - logger.warning( - f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}." 
- ) + logger.warning(f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}.") def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval): if self.control.should_log: if is_torch_tpu_available(): xm.mark_step() - logs: Dict[str, float] = {} + logs: dict[str, float] = {} # all_gather + mean() to get average loss over all processes tr_loss_scalar = self._nested_gather(tr_loss).mean().item() @@ -2299,7 +2285,7 @@ def _save_checkpoint(self, model, trial, metrics=None): # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" # Save model checkpoint - #checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + # checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.save_counter}" ########## self.save_counter += 1 @@ -2461,8 +2447,8 @@ def opt_load_hook(mod, opt): def hyperparameter_search( self, - hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None, - compute_objective: Optional[Callable[[Dict[str, float]], float]] = None, + hp_space: Optional[Callable[["optuna.Trial"], dict[str, float]]] = None, + compute_objective: Optional[Callable[[dict[str, float]], float]] = None, n_trials: int = 20, direction: str = "minimize", backend: Optional[Union["str", HPSearchBackend]] = None, @@ -2480,11 +2466,11 @@ def hyperparameter_search( optimizer/scheduler. Args: - hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*): + hp_space (`Callable[["optuna.Trial"], dict[str, float]]`, *optional*): A function that defines the hyperparameter search space. Will default to [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or [`~trainer_utils.default_hp_space_sigopt`] depending on your backend. - compute_objective (`Callable[[Dict[str, float]], float]`, *optional*): + compute_objective (`Callable[[dict[str, float]], float]`, *optional*): A function computing the objective to minimize or maximize from the metrics returned by the `evaluate` method. Will default to [`~trainer_utils.default_compute_objective`]. n_trials (`int`, *optional*, defaults to 100): @@ -2497,7 +2483,7 @@ def hyperparameter_search( on which one is installed. If all are installed, will default to optuna. hp_name (`Callable[["optuna.Trial"], str]]`, *optional*): A function that defines the trial/run name. Will default to None. - kwargs (`Dict[str, Any]`, *optional*): + kwargs (`dict[str, Any]`, *optional*): Additional keyword arguments passed along to `optuna.create_study` or `ray.tune.run`. For more information see: - the documentation of @@ -2542,7 +2528,6 @@ def hyperparameter_search( } backend_run = backend_dict[backend] except NameError: - ALL_HYPERPARAMETER_SEARCH_BACKENDS backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]() backend_run = backend_obj.run try: @@ -2554,17 +2539,16 @@ def hyperparameter_search( self.hp_name = hp_name self.compute_objective = default_compute_objective if compute_objective is None else compute_objective - best_run = backend_run(self, n_trials, direction, **kwargs) self.hp_search_backend = None return best_run - def log(self, logs: Dict[str, float]) -> None: + def log(self, logs: dict[str, float]) -> None: """ Log `logs` on the various objects watching training. Subclass and override this method to inject custom behavior. Args: - logs (`Dict[str, float]`): + logs (`dict[str, float]`): The values to log. 
""" if self.state.epoch is not None: @@ -2592,7 +2576,7 @@ def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, return data.to(**kwargs) return data - def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]: + def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]: """ Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and handling potential state. @@ -2633,14 +2617,14 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True): return ctx_manager - def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: + def training_step(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor: """ Perform a training step on a batch of inputs. Subclass and override to inject custom behavior. Args: model (`nn.Module`): The model to train. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): + inputs (`dict[str, Union[torch.Tensor, Any]]`): The inputs and targets of the model. The dictionary will be unpacked before being fed to the model. Most models expect the targets under the argument `labels`. Check your model's documentation for all accepted arguments. @@ -2856,7 +2840,7 @@ def store_flos(self): def _sorted_checkpoints( self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False - ) -> List[str]: + ) -> list[str]: ordering_and_checkpoint_path = [] glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)] @@ -2906,9 +2890,9 @@ def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: def evaluate( self, eval_dataset: Optional[Dataset] = None, - ignore_keys: Optional[List[str]] = None, + ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "eval", - ) -> Dict[str, float]: + ) -> dict[str, float]: """ Run evaluation and returns metrics. The calling script will be responsible for providing a method to compute metrics, as they are task-dependent @@ -2971,7 +2955,7 @@ def evaluate( return output.metrics def predict( - self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test" + self, test_dataset: Dataset, ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "test" ) -> PredictionOutput: """ Run prediction and returns predictions and potential metrics. @@ -2992,10 +2976,10 @@ def predict( in a token classification task) the predictions will be padded (on the right) to allow for concatenation into one array. The padding index is -100. - Returns: *NamedTuple* A namedtuple with the following keys: + Returns: *Namedtuple* A namedtuple with the following keys: - predictions (`np.ndarray`): The predictions on `test_dataset`. - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). - - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + - metrics (`dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained labels). 
""" # memory metrics - must set up as early as possible @@ -3030,7 +3014,7 @@ def evaluation_loop( dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, + ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: """ @@ -3156,9 +3140,7 @@ def evaluation_loop( ) if labels_host is not None: labels = nested_numpify(labels_host) - all_labels = ( - labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) - ) + all_labels = labels if all_labels is None else nested_concat(all_labels, labels, padding_index=-100) # Set back to None to begin a new accumulation losses_host, preds_host, inputs_host, labels_host = None, None, None, None @@ -3263,9 +3245,7 @@ def _pad_across_processes(self, tensor, pad_index=-100): elif isinstance(tensor, dict): return type(tensor)({k: self._pad_across_processes(v, pad_index=pad_index) for k, v in tensor.items()}) elif not isinstance(tensor, torch.Tensor): - raise TypeError( - f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors." - ) + raise TypeError(f"Can't pad the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors.") if len(tensor.shape) < 2: return tensor @@ -3290,17 +3270,17 @@ def _pad_across_processes(self, tensor, pad_index=-100): def prediction_step( self, model: nn.Module, - inputs: Dict[str, Union[torch.Tensor, Any]], + inputs: dict[str, Union[torch.Tensor, Any]], prediction_loss_only: bool, - ignore_keys: Optional[List[str]] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + ignore_keys: Optional[list[str]] = None, + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: """ Perform an evaluation step on `model` using `inputs`. Subclass and override to inject custom behavior. Args: model (`nn.Module`): The model to evaluate. - inputs (`Dict[str, Union[torch.Tensor, Any]]`): + inputs (`dict[str, Union[torch.Tensor, Any]]`): The inputs and targets of the model. The dictionary will be unpacked before being fed to the model. Most models expect the targets under the argument `labels`. Check your model's documentation for all accepted arguments. @@ -3310,7 +3290,7 @@ def prediction_step( A list of keys in the output of your model (if it is a dictionary) that should be ignored when gathering predictions. Return: - Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and labels (each being optional). """ has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) @@ -3388,13 +3368,13 @@ def prediction_step( return (loss, logits, labels) - def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]): + def floating_point_ops(self, inputs: dict[str, Union[torch.Tensor, Any]]): """ For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point operations for every backward + forward pass. If using another model, either implement such a method in the model or subclass and override this method. Args: - inputs (`Dict[str, Union[torch.Tensor, Any]]`): + inputs (`dict[str, Union[torch.Tensor, Any]]`): The inputs and targets of the model. Returns: `int`: The number of floating-point operations. 
@@ -3426,7 +3406,7 @@ def init_git_repo(self, at_init: bool = False): create_repo(repo_name, token=self.args.hub_token, private=self.args.hub_private_repo, exist_ok=True) try: self.repo = Repository(self.args.output_dir, clone_from=repo_name, token=self.args.hub_token) - except EnvironmentError: + except OSError: if self.args.overwrite_output_dir and at_init: # Try again after wiping output_dir shutil.rmtree(self.args.output_dir) @@ -3454,13 +3434,13 @@ def create_model_card( self, language: Optional[str] = None, license: Optional[str] = None, - tags: Union[str, List[str], None] = None, + tags: Union[str, list[str], None] = None, model_name: Optional[str] = None, finetuned_from: Optional[str] = None, - tasks: Union[str, List[str], None] = None, - dataset_tags: Union[str, List[str], None] = None, - dataset: Union[str, List[str], None] = None, - dataset_args: Union[str, List[str], None] = None, + tasks: Union[str, list[str], None] = None, + dataset_tags: Union[str, list[str], None] = None, + dataset: Union[str, list[str], None] = None, + dataset_args: Union[str, list[str], None] = None, ): """ Creates a draft of a model card using the information available to the `Trainer`. @@ -3470,20 +3450,20 @@ def create_model_card( license (`str`, *optional*): The license of the model. Will default to the license of the pretrained model used, if the original model given to the `Trainer` comes from a repo on the Hub. - tags (`str` or `List[str]`, *optional*): + tags (`str` or `list[str]`, *optional*): Some tags to be included in the metadata of the model card. model_name (`str`, *optional*): The name of the model. finetuned_from (`str`, *optional*): The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo of the original model given to the `Trainer` (if it comes from the Hub). - tasks (`str` or `List[str]`, *optional*): + tasks (`str` or `list[str]`, *optional*): One or several task identifiers, to be included in the metadata of the model card. - dataset_tags (`str` or `List[str]`, *optional*): + dataset_tags (`str` or `list[str]`, *optional*): One or several dataset tags, to be included in the metadata of the model card. - dataset (`str` or `List[str]`, *optional*): + dataset (`str` or `list[str]`, *optional*): One or several dataset identifiers, to be included in the metadata of the model card. - dataset_args (`str` or `List[str]`, *optional*): + dataset_args (`str` or `list[str]`, *optional*): One or several dataset arguments, to be included in the metadata of the model card. """ if not self.is_world_process_zero(): @@ -3596,7 +3576,7 @@ def push_to_hub(self, commit_message: Optional[str] = "End of training", blockin self.repo.push_to_hub( commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True ) - except EnvironmentError as exc: + except OSError as exc: logger.error(f"Error pushing update to the model card. 
Please read logs and retry.\n${exc}") return git_head_commit_url @@ -3610,7 +3590,7 @@ def prediction_loop( dataloader: DataLoader, description: str, prediction_loss_only: Optional[bool] = None, - ignore_keys: Optional[List[str]] = None, + ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "eval", ) -> EvalLoopOutput: """ @@ -3654,9 +3634,9 @@ def prediction_loop( logger.info(f" Num examples = {num_examples}") logger.info(f" Batch size = {batch_size}") losses_host: torch.Tensor = None - preds_host: Union[torch.Tensor, List[torch.Tensor]] = None - labels_host: Union[torch.Tensor, List[torch.Tensor]] = None - inputs_host: Union[torch.Tensor, List[torch.Tensor]] = None + preds_host: Union[torch.Tensor, list[torch.Tensor]] = None + labels_host: Union[torch.Tensor, list[torch.Tensor]] = None + inputs_host: Union[torch.Tensor, list[torch.Tensor]] = None world_size = max(1, args.world_size) @@ -3776,7 +3756,7 @@ def _add_sm_patterns_to_gitignore(self) -> None: # Get current .gitignore content if os.path.exists(os.path.join(self.repo.local_dir, ".gitignore")): - with open(os.path.join(self.repo.local_dir, ".gitignore"), "r") as f: + with open(os.path.join(self.repo.local_dir, ".gitignore")) as f: current_content = f.read() else: current_content = "" diff --git a/src/lmflow/pipeline/utils/rm_dataprocessor.py b/src/lmflow/pipeline/utils/rm_dataprocessor.py index e33a7e14d..0e6dc7044 100644 --- a/src/lmflow/pipeline/utils/rm_dataprocessor.py +++ b/src/lmflow/pipeline/utils/rm_dataprocessor.py @@ -1,12 +1,10 @@ import logging from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union -from datasets import load_dataset from transformers import AutoTokenizer from transformers.utils import PaddingStrategy - logger = logging.getLogger(__name__) @@ -18,7 +16,7 @@ class RewardDataCollatorWithPadding: pad_to_multiple_of: Optional[int] = None return_tensors: str = "pt" - def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: + def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]: merged_features = [] for feature in features: merged_features.append( diff --git a/src/lmflow/pipeline/utils/rm_trainer.py b/src/lmflow/pipeline/utils/rm_trainer.py index cbd064717..f3145ed50 100644 --- a/src/lmflow/pipeline/utils/rm_trainer.py +++ b/src/lmflow/pipeline/utils/rm_trainer.py @@ -3,24 +3,18 @@ import torch.nn as nn from transformers import Trainer -from .peft_trainer import PeftTrainer - def compute_metrics(eval_pred): result = {} pos_predictions_scores = eval_pred.predictions[0] neg_predictions_scores = eval_pred.predictions[1] # We assume that the first sample is preferred by default in groundtruth - result['accuracy'] = np.sum( - pos_predictions_scores > neg_predictions_scores) / len(pos_predictions_scores) + result["accuracy"] = np.sum(pos_predictions_scores > neg_predictions_scores) / len(pos_predictions_scores) return result def rm_loss(model, inputs, return_outputs=False): - rewards = model( - input_ids=inputs["input_ids"], - attention_mask=inputs["attention_mask"] - )[0] + rewards = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])[0] bsz = rewards.size(0) jidx = torch.arange(0, bsz, 2) kidx = jidx + 1 @@ -35,8 +29,3 @@ def rm_loss(model, inputs, return_outputs=False): class RewardTrainer(Trainer): def compute_loss(self, model, inputs, return_outputs=False): return rm_loss(model, inputs, return_outputs) - - -class PeftRewardTrainer(PeftTrainer): - def compute_loss(self, 
model, inputs, return_outputs=False): - return rm_loss(model, inputs, return_outputs) \ No newline at end of file diff --git a/src/lmflow/pipeline/vllm_inferencer.py b/src/lmflow/pipeline/vllm_inferencer.py index 9a0f0189a..d2905c34a 100644 --- a/src/lmflow/pipeline/vllm_inferencer.py +++ b/src/lmflow/pipeline/vllm_inferencer.py @@ -1,39 +1,37 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import copy import importlib.resources as pkg_resources import json import logging import os -os.environ['VLLM_WORKER_MULTIPROC_METHOD']='spawn' + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" import subprocess import sys -from functools import partial -from typing import List, Union, Optional, Dict, Any +from typing import Any, Optional, Union import numpy as np from transformers import AutoTokenizer -from lmflow.datasets import Dataset -from lmflow.pipeline.base_pipeline import BasePipeline -from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.args import ( - InferencerArguments, - ModelArguments, DatasetArguments, + InferencerArguments, + ModelArguments, ) +from lmflow.datasets import Dataset +from lmflow.models.hf_decoder_model import HFDecoderModel +from lmflow.pipeline.base_pipeline import BasePipeline from lmflow.utils.common import make_shell_args_from_dataclass -from lmflow.utils.constants import RETURN_CODE_ERROR_BUFFER, MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE +from lmflow.utils.constants import MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE, RETURN_CODE_ERROR_BUFFER from lmflow.utils.data_utils import VLLMInferenceResultWithInput -from lmflow.utils.versioning import is_vllm_available, is_ray_available - +from lmflow.utils.versioning import is_ray_available, is_vllm_available logger = logging.getLogger(__name__) if is_vllm_available(): - from vllm import SamplingParams, LLM + from vllm import SamplingParams else: raise ImportError("VLLM is not available, please install vllm.") @@ -43,11 +41,11 @@ from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy else: logger.warning("Ray is not available, distributed vllm inference will not be supported.") - + class InferencerWithOffloading(BasePipeline): def __init__( - self, + self, model_args: ModelArguments, data_args: DatasetArguments, inferencer_args: InferencerArguments, @@ -59,14 +57,14 @@ def __init__( def inference(self): raise NotImplementedError(".inference is not implemented") - + def save_inference_results(self): raise NotImplementedError(".save_inference_results is not implemented") - + def load_inference_results(self): raise NotImplementedError(".load_inference_results is not implemented") - - + + class VLLMInferencer(InferencerWithOffloading): def __init__( self, @@ -77,8 +75,7 @@ def __init__( assert inferencer_args.use_vllm, "The inferencer_args.use_vllm must be True." 
super().__init__(model_args, data_args, inferencer_args) self.sampling_params = self.parse_to_sampling_params(inferencer_args) - - + def parse_to_sampling_params( self, inference_args: InferencerArguments, @@ -91,20 +88,19 @@ def parse_to_sampling_params( seed=inference_args.random_seed, top_p=inference_args.top_p, top_k=inference_args.top_k, - stop_token_ids=[self.eos_token_id] + inference_args.additional_stop_token_ids + stop_token_ids=[self.eos_token_id] + inference_args.additional_stop_token_ids, ) - - + def inference( self, - model: HFDecoderModel, - dataset: Dataset, + model: HFDecoderModel, + dataset: Dataset, enable_decode_inference_result: bool = True, release_gpu: bool = False, inference_args: Optional[InferencerArguments] = None, enable_distributed_inference: bool = False, **kwargs, - ) -> List[VLLMInferenceResultWithInput]: + ) -> list[VLLMInferenceResultWithInput]: """Perform inference using the provided model and dataset. Will save inference results if `save_results` is set to True in `inferencer_args`. @@ -119,43 +115,41 @@ def inference( enable_decode_inference_result : bool, optional Whether to decode after generation, by default False. release_gpu : bool, optional - Whether to release gpu resources, by default False. + Whether to release gpu resources, by default False. inference_args : InferencerArguments, optional by default None Returns ------- - List[VLLMInferenceResultWithInput] + list[VLLMInferenceResultWithInput] Return a list of VLLMInferenceResultWithInput, where each element contains the input prompt and the corresponding output. - + When `enable_decode_inference_result = True`, the output would be a list of strings, contains sampling_params.n samples for the corresponding prompt. - - When `enable_decode_inference_result = False`, return a list of list of ints + + When `enable_decode_inference_result = False`, return a list of list of ints (token ids, no decoding after generation). 
""" if inference_args: - logger.warning( - "Overriding the default inference arguments with the provided arguments in .inference()" - ) + logger.warning("Overriding the default inference arguments with the provided arguments in .inference()") sampling_params = self.parse_to_sampling_params(inference_args) else: sampling_params = self.sampling_params - + sampling_params.detokenize = enable_decode_inference_result - + model_input = model.prepare_inputs_for_inference( dataset=dataset, apply_chat_template=self.inferencer_args.apply_chat_template, use_vllm=self.inferencer_args.use_vllm, enable_distributed_inference=enable_distributed_inference, ) - + if enable_distributed_inference: outputs = self._distributed_inference( - model=model, - model_input=model_input, + model=model, + model_input=model_input, sampling_params=sampling_params, num_instances=kwargs.get("distributed_inference_num_instances"), batch_size=kwargs.get("inference_batch_size", 4), @@ -164,24 +158,23 @@ def inference( else: outputs = self._inference( model=model, - model_input=model_input, + model_input=model_input, sampling_params=sampling_params, release_gpu=release_gpu, ) if self.inferencer_args.save_results: self.save_inference_results(outputs, self.inferencer_args.results_path) - - return outputs + return outputs def _inference( self, - model: HFDecoderModel, - model_input: List[str], + model: HFDecoderModel, + model_input: list[str], sampling_params: SamplingParams, release_gpu: bool = False, - ) -> List[VLLMInferenceResultWithInput]: + ) -> list[VLLMInferenceResultWithInput]: outputs = model.inference( inputs=model_input, sampling_params=sampling_params, @@ -190,38 +183,32 @@ def _inference( vllm_gpu_memory_utilization=self.inferencer_args.vllm_gpu_memory_utilization, vllm_tensor_parallel_size=self.inferencer_args.vllm_tensor_parallel_size, ) - + return outputs - - + def _distributed_inference( self, - model: HFDecoderModel, - model_input: ray.data.Dataset, + model: HFDecoderModel, + model_input: ray.data.Dataset, sampling_params: SamplingParams, num_instances: int, batch_size: int = 4, release_gpu: bool = False, - ) -> List[VLLMInferenceResultWithInput]: + ) -> list[VLLMInferenceResultWithInput]: # prepare distributed inference resources # from https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_distributed.py ## strategy def scheduling_strategy_fn(): # One bundle per tensor parallel worker pg = ray.util.placement_group( - [{ - "GPU": 1, - "CPU": 1 - }] * self.inferencer_args.vllm_tensor_parallel_size, + [{"GPU": 1, "CPU": 1}] * self.inferencer_args.vllm_tensor_parallel_size, strategy="STRICT_PACK", ) return dict( - scheduling_strategy=PlacementGroupSchedulingStrategy( - pg, placement_group_capture_child_tasks=True - ) + scheduling_strategy=PlacementGroupSchedulingStrategy(pg, placement_group_capture_child_tasks=True) ) - - resources_kwarg: Dict[str, Any] = {} + + resources_kwarg: dict[str, Any] = {} if self.inferencer_args.vllm_tensor_parallel_size == 1: # For tensor_parallel_size == 1, we simply set num_gpus=1. resources_kwarg["num_gpus"] = 1 @@ -231,16 +218,16 @@ def scheduling_strategy_fn(): # each instance. 
resources_kwarg["num_gpus"] = 0 resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn - + ## predictor class DistributedPredictor: def __init__( - self, + self, model: HFDecoderModel, sampling_params: SamplingParams, vllm_gpu_memory_utilization: float, vllm_tensor_parallel_size: int, - release_gpu: bool=False, + release_gpu: bool = False, ): self.model = copy.deepcopy(model) self.model.activate_model_for_inference( @@ -250,26 +237,25 @@ def __init__( ) self.sampling_params = sampling_params self.release_gpu = release_gpu - - def __call__(self, batch: Dict[str, np.ndarray]): - """batch: Dict[str, np.ndarray], {"item": array(['...', '...', '...', ...])} - """ + + def __call__(self, batch: dict[str, np.ndarray]): + """batch: dict[str, np.ndarray], {"item": array(['...', '...', '...', ...])}""" batched_inference_res = self.model.inference( - inputs=batch['item'], + inputs=batch["item"], sampling_params=self.sampling_params, release_gpu=self.release_gpu, use_vllm=True, - ) # this is the postprocessed output, see model.__vllm_inference + ) # this is the postprocessed output, see model.__vllm_inference batched_final_res = { - "input": [sample['input'] for sample in batched_inference_res], - "output": [sample['output'] for sample in batched_inference_res] - } # do this since we're writing to a pandas dataframe + "input": [sample["input"] for sample in batched_inference_res], + "output": [sample["output"] for sample in batched_inference_res], + } # do this since we're writing to a pandas dataframe return batched_final_res - - # inference + + # inference model_input_mapping = model_input.map_batches( DistributedPredictor, - concurrency=num_instances, # Set the concurrency to the number of LLM instances. + concurrency=num_instances, # Set the concurrency to the number of LLM instances. batch_size=batch_size, fn_constructor_kwargs={ "model": model, @@ -280,38 +266,34 @@ def __call__(self, batch: Dict[str, np.ndarray]): }, **resources_kwarg, ) - - df_model_output = model_input_mapping.to_pandas() # the actual forwards are executed here + + df_model_output = model_input_mapping.to_pandas() # the actual forwards are executed here logger.info(f"Distributed vllm inference result preview:\n{df_model_output.head(10)}") - - model_output = [ - {"input": row["input"], "output": row["output"]} for _, row in df_model_output[:].iterrows() - ] - + + model_output = [{"input": row["input"], "output": row["output"]} for _, row in df_model_output[:].iterrows()] + return model_output - - + def save_inference_results( self, - outputs: Union[List[List[str]], List[List[List[int]]]], + outputs: Union[list[list[str]], list[list[list[int]]]], save_file_path: str, ): - with open(save_file_path, "w", encoding='utf-8') as f: + with open(save_file_path, "w", encoding="utf-8") as f: json.dump(outputs, f, ensure_ascii=False, indent=4) - + logger.info(f"Inference results are saved to {save_file_path}.") - - + def load_inference_results( self, results_path: str, - ) -> Union[List[List[str]], List[List[List[int]]]]: - with open(results_path, "r") as f: + ) -> Union[list[list[str]], list[list[list[int]]]]: + with open(results_path) as f: results = json.load(f) - + return results - - + + class MemorySafeVLLMInferencer(VLLMInferencer): def __init__( self, @@ -322,13 +304,12 @@ def __init__( assert inferencer_args.save_results, "For MemorySafeVLLMInferencer, `save_results` must be True." 
super().__init__(model_args, data_args, inferencer_args) self.inferencer_file_path = pkg_resources.files("lmflow.pipeline.utils") / "memory_safe_vllm_inference.py" - - - def inference(self) -> List[VLLMInferenceResultWithInput]: + + def inference(self) -> list[VLLMInferenceResultWithInput]: inferencer_args = make_shell_args_from_dataclass( dataclass_objects=[ - self.model_args, - self.data_args, + self.model_args, + self.data_args, self.inferencer_args, ], format="shell", @@ -337,7 +318,7 @@ def inference(self) -> List[VLLMInferenceResultWithInput]: current_env = os.environ.copy() for var in MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE: current_env.pop(var, None) - + cli_res = subprocess.run( args=cmd, stdout=sys.stdout, @@ -347,10 +328,11 @@ def inference(self) -> List[VLLMInferenceResultWithInput]: env=current_env, ) logger.info(f"MemorySafeVLLMInference subprocess run finished, info at finish: {cli_res}") - + if cli_res.returncode in RETURN_CODE_ERROR_BUFFER: - # > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''> - # > at interpreter shutdown, possibly due to daemon threads + # > Fatal Python error: _enter_buffered_busy: could not acquire lock for + # > <_io.BufferedWriter name=''> at interpreter shutdown, possibly + # > due to daemon threads logger.warning( "^^^^^^^^^^ Please ignore the above error, as it comes from the subprocess. " "This may due to a kill signal with unfinished stdout/stderr writing in the subprocess. " @@ -358,8 +340,8 @@ def inference(self) -> List[VLLMInferenceResultWithInput]: else: if cli_res.returncode != 0: raise RuntimeError(f"Error during MemorySafeVLLMInference: {cli_res}") - + outputs = self.load_inference_results(self.inferencer_args.results_path) logger.info("MemorySafeVLLMInference result captured.") - - return outputs \ No newline at end of file + + return outputs diff --git a/src/lmflow/tokenization/__init__.py b/src/lmflow/tokenization/__init__.py index 8e8fc5548..7232b6002 100644 --- a/src/lmflow/tokenization/__init__.py +++ b/src/lmflow/tokenization/__init__.py @@ -1,3 +1,2 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. \ No newline at end of file +# Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. diff --git a/src/lmflow/tokenization/hf_decoder_model.py b/src/lmflow/tokenization/hf_decoder_model.py index 250c7f162..a7104621e 100644 --- a/src/lmflow/tokenization/hf_decoder_model.py +++ b/src/lmflow/tokenization/hf_decoder_model.py @@ -1,32 +1,29 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
import logging -from logging import Logger -from typing import Dict, Union, Optional +from typing import Union import transformers -from transformers.testing_utils import CaptureLogger from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.testing_utils import CaptureLogger from lmflow.args import DatasetArguments -from lmflow.utils.conversation_template import ConversationTemplate from lmflow.utils.constants import CONVERSATION_ROLE_NAMES - +from lmflow.utils.conversation_template import ConversationTemplate logger = logging.getLogger(__name__) tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") def blocking( - token_dict: Dict, - block_size: int, + token_dict: dict, + block_size: int, model_max_length: int, pad_token_id: int, padding_side: str, - truncation_side: str='right', -) -> Dict: + truncation_side: str = "right", +) -> dict: num_example = len(token_dict[list(token_dict.keys())[0]]) for i in range(num_example): max_length = min(block_size, model_max_length) @@ -34,47 +31,31 @@ def blocking( if pad_length < 0: # Truncates too long samples for key in ["input_ids", "attention_mask", "labels"]: - if truncation_side == 'right': + if truncation_side == "right": token_dict[key][i] = token_dict[key][i][:max_length] - elif truncation_side == 'left': + elif truncation_side == "left": token_dict[key][i] = token_dict[key][i][-max_length:] else: - raise ValueError( - f"truncation_side should be either 'right' or 'left', got {truncation_side}" - ) + raise ValueError(f"truncation_side should be either 'right' or 'left', got {truncation_side}") else: - if padding_side == 'right': + if padding_side == "right": # Pads too short samples - token_dict["input_ids"][i].extend( - [pad_token_id for _ in range(pad_length)] - ) - token_dict["attention_mask"][i].extend( - [0 for _ in range(pad_length)] - ) - token_dict["labels"][i].extend( - [-100 for _ in range(pad_length)] - ) - elif padding_side == 'left': + token_dict["input_ids"][i].extend([pad_token_id for _ in range(pad_length)]) + token_dict["attention_mask"][i].extend([0 for _ in range(pad_length)]) + token_dict["labels"][i].extend([-100 for _ in range(pad_length)]) + elif padding_side == "left": # Pads too short samples - token_dict["input_ids"][i] = ( - [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] - ) - token_dict["attention_mask"][i] = ( - [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] - ) - token_dict["labels"][i] = ( - [-100 for _ in range(pad_length)] + token_dict["labels"][i] - ) + token_dict["input_ids"][i] = [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] + token_dict["attention_mask"][i] = [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] + token_dict["labels"][i] = [-100 for _ in range(pad_length)] + token_dict["labels"][i] else: - raise ValueError( - f"padding_side should be either 'right' or 'left', got {padding_side}" - ) - + raise ValueError(f"padding_side should be either 'right' or 'left', got {padding_side}") + return token_dict def tokenize_function( - examples, + examples, data_args: DatasetArguments, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, @@ -82,9 +63,8 @@ def tokenize_function( tokenized_column_order, add_special_tokens, use_truncation, -) -> Dict: - """Handels text_only and text2text datasets tokenization - """ +) -> dict: + """Handels text_only and text2text datasets tokenization""" num_example = len(examples[column_names[0]]) 
token_dict = { "input_ids": [[] for _ in range(num_example)], @@ -102,20 +82,13 @@ def tokenize_function( if column_name in label_columns: labels = encoding["input_ids"].copy() else: - labels = [ - [-100] * len(encoding["input_ids"][i]) - for i in range(num_example) - ] + labels = [[-100] * len(encoding["input_ids"][i]) for i in range(num_example)] for i in range(num_example): - token_dict["input_ids"][i].extend( - encoding["input_ids"][i] - ) - token_dict["attention_mask"][i].extend( - encoding["attention_mask"][i] - ) + token_dict["input_ids"][i].extend(encoding["input_ids"][i]) + token_dict["attention_mask"][i].extend(encoding["attention_mask"][i]) token_dict["labels"][i].extend(labels[i]) - + if data_args.disable_group_texts: token_dict = blocking( token_dict=token_dict, @@ -133,17 +106,16 @@ def tokenize_function( " before being passed to the model." ) return token_dict - + def conversation_tokenize_function( - examples, + examples, data_args: DatasetArguments, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, - conversation_template: Union[ConversationTemplate, str] -) -> Dict: - """Handels conversation datasets tokenization - """ + conversation_template: Union[ConversationTemplate, str], +) -> dict: + """Handels conversation datasets tokenization""" num_example = len(examples[column_names[0]]) token_dict = { "input_ids": [[] for _ in range(num_example)], @@ -155,8 +127,8 @@ def conversation_tokenize_function( messages = examples["messages"][i] system = examples.get("system", [None] * num_example)[i] tools = examples.get("tools", [None] * num_example)[i] - - if isinstance(conversation_template, str): # jinja template + + if isinstance(conversation_template, str): # jinja template conversation = [{"role": "system", "content": system}] if system is not None else [] conversation.extend(messages) encoded_conversation = tokenizer.apply_chat_template( @@ -166,33 +138,31 @@ def conversation_tokenize_function( return_assistant_tokens_mask=True, return_dict=True, ) - + if data_args.train_on_prompt: labels = encoded_conversation["input_ids"] else: labels = [ - encoded_conversation["input_ids"][index] if mask == 1 else -100 + encoded_conversation["input_ids"][index] if mask == 1 else -100 for index, mask in enumerate(encoded_conversation["assistant_masks"]) ] - - token_dict['input_ids'][i].extend(encoded_conversation['input_ids']) - token_dict['attention_mask'][i].extend(encoded_conversation['attention_mask']) - token_dict['labels'][i].extend(labels) - - else: # lmflow `conversation_template` - if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: + + token_dict["input_ids"][i].extend(encoded_conversation["input_ids"]) + token_dict["attention_mask"][i].extend(encoded_conversation["attention_mask"]) + token_dict["labels"][i].extend(labels) + + else: # lmflow `conversation_template` + if len(messages) < 2 or messages[0]["role"] != CONVERSATION_ROLE_NAMES["user"]: tok_logger.warning( "Invalid instance encountered. Either the conversation has less than " "one round or the first message is not from the user." ) continue - + if len(messages) % 2 != 0: - logger.warning( - "The number of messages is not even, the last message will be ignored." 
- ) + logger.warning("The number of messages is not even, the last message will be ignored.") messages = messages[:-1] - + encoded_conversation = conversation_template.encode_conversation( tokenizer=tokenizer, messages=messages, @@ -203,12 +173,12 @@ def conversation_tokenize_function( input_ids, labels = [], [] for turn_idx, (user_input, assistant_result) in enumerate(encoded_conversation): input_ids += user_input + assistant_result - + if data_args.train_on_prompt: labels += user_input + assistant_result else: labels += [-100] * len(user_input) + assistant_result - + token_dict["input_ids"][i].extend(input_ids) token_dict["attention_mask"][i].extend([1] * len(input_ids)) token_dict["labels"][i].extend(labels) @@ -229,4 +199,4 @@ def conversation_tokenize_function( "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits" " before being passed to the model." ) - return token_dict \ No newline at end of file + return token_dict diff --git a/src/lmflow/tokenization/hf_text_regression_model.py b/src/lmflow/tokenization/hf_text_regression_model.py index 28be6095e..8da910420 100644 --- a/src/lmflow/tokenization/hf_text_regression_model.py +++ b/src/lmflow/tokenization/hf_text_regression_model.py @@ -1,33 +1,30 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -from logging import Logger -from typing import Dict, List, Union +from typing import Union import transformers -from transformers.testing_utils import CaptureLogger from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast +from transformers.testing_utils import CaptureLogger -from lmflow.utils.conversation_template import ConversationTemplate -from lmflow.utils.constants import CONVERSATION_ROLE_NAMES from lmflow.args import DatasetArguments - +from lmflow.utils.constants import CONVERSATION_ROLE_NAMES +from lmflow.utils.conversation_template import ConversationTemplate logger = logging.getLogger(__name__) tok_logger = transformers.utils.logging.get_logger("transformers.tokenization_utils_base") def blocking_paired( - token_dict: Dict, - column_names: List, - block_size: int, + token_dict: dict, + column_names: list, + block_size: int, model_max_length: int, pad_token_id: int, padding_side: str, - truncation_side: str='right', -) -> Dict: + truncation_side: str = "right", +) -> dict: num_example = len(token_dict[list(token_dict.keys())[0]]) for i in range(num_example): for column_name in column_names: @@ -36,47 +33,39 @@ def blocking_paired( if pad_length < 0: # Truncates too long samples for key in [f"input_ids_{column_name}", f"attention_mask_{column_name}"]: - if truncation_side == 'right': + if truncation_side == "right": token_dict[key][i] = token_dict[key][i][:pad_length] - elif truncation_side == 'left': + elif truncation_side == "left": token_dict[key][i] = token_dict[key][i][-pad_length:] else: - raise ValueError( - f"truncation_side should be either 'right' or 'left', got {truncation_side}" - ) + raise ValueError(f"truncation_side should be either 'right' or 'left', got {truncation_side}") else: - if padding_side == 'right': + if padding_side == "right": # Pads too short samples - token_dict[f"input_ids_{column_name}"][i].extend( - [pad_token_id for _ in range(pad_length)] - ) - token_dict[f"attention_mask_{column_name}"][i].extend( - [0 for _ in range(pad_length)] - ) - elif padding_side == 'left': + token_dict[f"input_ids_{column_name}"][i].extend([pad_token_id for _ in 
range(pad_length)]) + token_dict[f"attention_mask_{column_name}"][i].extend([0 for _ in range(pad_length)]) + elif padding_side == "left": # Pads too short samples - token_dict[f"input_ids_{column_name}"][i] = ( - [pad_token_id for _ in range(pad_length)] + token_dict[f"input_ids_{column_name}"][i] - ) - token_dict[f"attention_mask_{column_name}"][i] = ( - [0 for _ in range(pad_length)] + token_dict[f"attention_mask_{column_name}"][i] - ) + token_dict[f"input_ids_{column_name}"][i] = [pad_token_id for _ in range(pad_length)] + token_dict[ + f"input_ids_{column_name}" + ][i] + token_dict[f"attention_mask_{column_name}"][i] = [0 for _ in range(pad_length)] + token_dict[ + f"attention_mask_{column_name}" + ][i] else: - raise ValueError( - f"padding_side should be either 'right' or 'left', got {padding_side}" - ) - + raise ValueError(f"padding_side should be either 'right' or 'left', got {padding_side}") + return token_dict def blocking( - token_dict: Dict, - block_size: int, + token_dict: dict, + block_size: int, model_max_length: int, pad_token_id: int, padding_side: str, - truncation_side: str='right', -) -> Dict: + truncation_side: str = "right", +) -> dict: num_example = len(token_dict[list(token_dict.keys())[0]]) for i in range(num_example): max_length = min(block_size, model_max_length) @@ -84,101 +73,83 @@ def blocking( if pad_length < 0: # Truncates too long samples for key in ["input_ids", "attention_mask", "labels"]: - if truncation_side == 'right': + if truncation_side == "right": token_dict[key][i] = token_dict[key][i][:pad_length] - elif truncation_side == 'left': + elif truncation_side == "left": token_dict[key][i] = token_dict[key][i][-pad_length:] else: - raise ValueError( - f"truncation_side should be either 'right' or 'left', got {truncation_side}" - ) + raise ValueError(f"truncation_side should be either 'right' or 'left', got {truncation_side}") else: - if padding_side == 'right': + if padding_side == "right": # Pads too short samples - token_dict["input_ids"][i].extend( - [pad_token_id for _ in range(pad_length)] - ) - token_dict["attention_mask"][i].extend( - [0 for _ in range(pad_length)] - ) - token_dict["labels"][i].extend( - [-100 for _ in range(pad_length)] - ) - elif padding_side == 'left': + token_dict["input_ids"][i].extend([pad_token_id for _ in range(pad_length)]) + token_dict["attention_mask"][i].extend([0 for _ in range(pad_length)]) + token_dict["labels"][i].extend([-100 for _ in range(pad_length)]) + elif padding_side == "left": # Pads too short samples - token_dict["input_ids"][i] = ( - [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] - ) - token_dict["attention_mask"][i] = ( - [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] - ) - token_dict["labels"][i] = ( - [-100 for _ in range(pad_length)] + token_dict["labels"][i] - ) + token_dict["input_ids"][i] = [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][i] + token_dict["attention_mask"][i] = [0 for _ in range(pad_length)] + token_dict["attention_mask"][i] + token_dict["labels"][i] = [-100 for _ in range(pad_length)] + token_dict["labels"][i] else: - raise ValueError( - f"padding_side should be either 'right' or 'left', got {padding_side}" - ) - + raise ValueError(f"padding_side should be either 'right' or 'left', got {padding_side}") + return token_dict def blocking_text_to_textlist( - token_dict: Dict, - block_size: int, + token_dict: dict, + block_size: int, model_max_length: int, pad_token_id: int, padding_side: str, - truncation_side: str='right', 
-) -> Dict: + truncation_side: str = "right", +) -> dict: num_example = len(token_dict[list(token_dict.keys())[0]]) max_length = min(block_size, model_max_length) - + for example_idx in range(num_example): for content_idx in range(len(token_dict["input_ids"][example_idx])): pad_length = max_length - len(token_dict["input_ids"][example_idx][content_idx]) if pad_length < 0: # Truncates too long samples - if truncation_side == 'right': - token_dict["input_ids"][example_idx][content_idx] = token_dict["input_ids"][example_idx][content_idx][:pad_length] - elif truncation_side == 'left': - token_dict["input_ids"][example_idx][content_idx] = token_dict["input_ids"][example_idx][content_idx][-pad_length:] + if truncation_side == "right": + token_dict["input_ids"][example_idx][content_idx] = token_dict["input_ids"][example_idx][ + content_idx + ][:pad_length] + elif truncation_side == "left": + token_dict["input_ids"][example_idx][content_idx] = token_dict["input_ids"][example_idx][ + content_idx + ][-pad_length:] else: - raise ValueError( - f"truncation_side should be either 'right' or 'left', got {truncation_side}" - ) + raise ValueError(f"truncation_side should be either 'right' or 'left', got {truncation_side}") else: - if padding_side == 'right': + if padding_side == "right": # Pads too short samples - token_dict["input_ids"][example_idx][content_idx].extend( - [pad_token_id for _ in range(pad_length)] - ) - elif padding_side == 'left': + token_dict["input_ids"][example_idx][content_idx].extend([pad_token_id for _ in range(pad_length)]) + elif padding_side == "left": # Pads too short samples - token_dict["input_ids"][example_idx][content_idx] = ( - [pad_token_id for _ in range(pad_length)] + token_dict["input_ids"][example_idx][content_idx] - ) + token_dict["input_ids"][example_idx][content_idx] = [ + pad_token_id for _ in range(pad_length) + ] + token_dict["input_ids"][example_idx][content_idx] else: - raise ValueError( - f"padding_side should be either 'right' or 'left', got {padding_side}" - ) - + raise ValueError(f"padding_side should be either 'right' or 'left', got {padding_side}") + return token_dict def paired_conversation_tokenize_function( - examples, + examples, data_args: DatasetArguments, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, conversation_template: ConversationTemplate, -) -> Dict: +) -> dict: num_example = len(examples[column_names[0]]) token_dict = {} for column_name in column_names: token_dict[f"input_ids_{column_name}"] = [[] for _ in range(num_example)] token_dict[f"attention_mask_{column_name}"] = [[] for _ in range(num_example)] - + with CaptureLogger(tok_logger) as cl: num_corrupted = 0 for i in range(num_example): @@ -187,19 +158,17 @@ def paired_conversation_tokenize_function( messages = examples[column_name][i]["messages"] system = examples[column_name][i].get("system", None) tools = examples[column_name][i].get("tools", None) - if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: + if len(messages) < 2 or messages[0]["role"] != CONVERSATION_ROLE_NAMES["user"]: tok_logger.warning( "Invalid instance encountered. Either the conversation has less than " "one round or the first message is not from the user." ) continue - + if len(messages) % 2 != 0: - logger.warning( - "The number of messages is not even, the last message will be ignored." 
- ) + logger.warning("The number of messages is not even, the last message will be ignored.") messages = messages[:-1] - + encoded_conversation = conversation_template.encode_conversation( tokenizer=tokenizer, messages=messages, @@ -210,18 +179,18 @@ def paired_conversation_tokenize_function( input_ids = [] for turn_idx, (user_input, assistant_result) in enumerate(encoded_conversation): input_ids += user_input + assistant_result - + token_dict[f"input_ids_{column_name}"][i].extend(input_ids) token_dict[f"attention_mask_{column_name}"][i].extend([1] * len(input_ids)) - - except: + + except Exception: num_corrupted += 1 logger.error(f"Error in encoding conversation {i}: {column_name}") logger.error(f"Messages: {messages}") continue if num_corrupted > 0: logger.error(f"Number of corrupted examples: {num_corrupted}") - + if data_args.disable_group_texts: token_dict = blocking_paired( token_dict=token_dict, @@ -243,14 +212,13 @@ def paired_conversation_tokenize_function( def conversation_tokenize_function( - examples, + examples, data_args: DatasetArguments, - tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], + tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, conversation_template: ConversationTemplate, -) -> Dict: - """Handels conversation datasets tokenization - """ +) -> dict: + """Handels conversation datasets tokenization""" num_example = len(examples[column_names[0]]) token_dict = { "input_ids": [[] for _ in range(num_example)], @@ -262,19 +230,17 @@ def conversation_tokenize_function( messages = examples["messages"][i] system = examples.get("system", [None] * num_example)[i] tools = examples.get("tools", [None] * num_example)[i] - if len(messages) < 2 or messages[0]['role'] != CONVERSATION_ROLE_NAMES['user']: + if len(messages) < 2 or messages[0]["role"] != CONVERSATION_ROLE_NAMES["user"]: tok_logger.warning( "Invalid instance encountered. Either the conversation has less than " "one round or the first message is not from the user." ) continue - + if len(messages) % 2 != 0: - logger.warning( - "The number of messages is not even, the last message will be ignored." 
- ) + logger.warning("The number of messages is not even, the last message will be ignored.") messages = messages[:-1] - + encoded_conversation = conversation_template.encode_conversation( tokenizer=tokenizer, messages=messages, @@ -285,12 +251,12 @@ def conversation_tokenize_function( input_ids, labels = [], [] for turn_idx, (user_input, assistant_result) in enumerate(encoded_conversation): input_ids += user_input + assistant_result - + if data_args.train_on_prompt: labels += user_input + assistant_result else: labels += [-100] * len(user_input) + assistant_result - + token_dict["input_ids"][i].extend(input_ids) token_dict["attention_mask"][i].extend([1] * len(input_ids)) token_dict["labels"][i].extend(labels) @@ -313,9 +279,9 @@ def conversation_tokenize_function( ) return token_dict - + def tokenize_function( - examples, + examples, data_args: DatasetArguments, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, @@ -323,9 +289,8 @@ def tokenize_function( tokenized_column_order, add_special_tokens, use_truncation, -) -> Dict: - """Handels text_only and text2text datasets tokenization - """ +) -> dict: + """Handels text_only and text2text datasets tokenization""" num_example = len(examples[column_names[0]]) token_dict = { "input_ids": [[] for _ in range(num_example)], @@ -343,18 +308,11 @@ def tokenize_function( if column_name in label_columns: labels = encoding["input_ids"].copy() else: - labels = [ - [-100] * len(encoding["input_ids"][i]) - for i in range(num_example) - ] + labels = [[-100] * len(encoding["input_ids"][i]) for i in range(num_example)] for i in range(num_example): - token_dict["input_ids"][i].extend( - encoding["input_ids"][i] - ) - token_dict["attention_mask"][i].extend( - encoding["attention_mask"][i] - ) + token_dict["input_ids"][i].extend(encoding["input_ids"][i]) + token_dict["attention_mask"][i].extend(encoding["attention_mask"][i]) token_dict["labels"][i].extend(labels[i]) if data_args.disable_group_texts: @@ -377,24 +335,24 @@ def tokenize_function( def text_to_textlist_tokenize_function( - examples, + examples, data_args: DatasetArguments, tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], column_names, add_special_tokens, use_truncation, -) -> Dict: +) -> dict: """For rm inference, and don't need attn mask and labels. 
NOTE: input_ids here refers to the tokenized input_ids of the input **and** output """ num_example = len(examples[column_names[0]]) output_dict = {column_name: examples[column_name] for column_name in column_names} output_dict["input_ids"] = [[] for _ in range(num_example)] - + for example_idx in range(num_example): encoded = tokenizer( [ - examples["input"][example_idx] + examples["output"][example_idx][i] + examples["input"][example_idx] + examples["output"][example_idx][i] for i in range(len(examples["output"][example_idx])) ], add_special_tokens=add_special_tokens, @@ -402,7 +360,7 @@ def text_to_textlist_tokenize_function( ) output_dict["input_ids"][example_idx] = encoded["input_ids"] - + if data_args.disable_group_texts: output_dict = blocking_text_to_textlist( token_dict=output_dict, @@ -412,5 +370,5 @@ def text_to_textlist_tokenize_function( padding_side=tokenizer.padding_side, truncation_side=tokenizer.truncation_side, ) - - return output_dict \ No newline at end of file + + return output_dict diff --git a/src/lmflow/utils/common.py b/src/lmflow/utils/common.py index 0d1a5dd7e..c3473f341 100644 --- a/src/lmflow/utils/common.py +++ b/src/lmflow/utils/common.py @@ -1,10 +1,9 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -from dataclasses import dataclass, field, fields, Field, make_dataclass +from dataclasses import Field, fields, make_dataclass from pathlib import Path -from typing import Optional, List, Union, Dict +from typing import Optional, Union from lmflow.utils.versioning import get_python_version @@ -12,25 +11,25 @@ def make_shell_args_from_dataclass( - dataclass_objects: List, - format: str="subprocess", - skip_default: bool=True, - ignored_args_list: Optional[List[str]]=None, -) -> Union[str, List[str]]: + dataclass_objects: list, + format: str = "subprocess", + skip_default: bool = True, + ignored_args_list: Optional[list[str]] = None, +) -> Union[str, list[str]]: """Return a string or a list of strings that can be used as shell arguments. Parameters ---------- - dataclass_objects : List + dataclass_objects : list A list of dataclass objects. format : str, optional Return format, can be "shell" or "subprocess", by default "subprocess". skip_default : bool, optional - Whether to skip attributes with default values, by default True. + Whether to skip attributes with default values, by default True. Returns ------- - Union[str, List[str]] + Union[str, list[str]] """ assert isinstance(dataclass_objects, list), "dataclass_objects should be a list of dataclass objects." 
all_args = {} @@ -47,7 +46,7 @@ def make_shell_args_from_dataclass( if skip_default: if dataclass_object.__dataclass_fields__[k].default == v: continue - + if k not in all_args: if isinstance(v, Path): all_args[k] = str(v) @@ -61,7 +60,7 @@ def make_shell_args_from_dataclass( else: logger.warning(f"Found different values for the same key: {k}, using value: {v} instead.") all_args[k] = v - + if format == "shell": final_res = " ".join([f"--{k} {v}" for k, v in all_args.items()]) elif format == "subprocess": @@ -70,16 +69,11 @@ def make_shell_args_from_dataclass( final_res.extend([f"--{k}", str(v)]) else: raise ValueError(f"Unknown format: {format}") - + return final_res -def create_copied_dataclass( - original_dataclass, - field_prefix: str, - class_prefix: str, - new_default: Dict=None -): +def create_copied_dataclass(original_dataclass, field_prefix: str, class_prefix: str, new_default: dict = None): """Create a copied dataclass with new field names and default values. Parameters @@ -89,8 +83,8 @@ def create_copied_dataclass( The prefix to add to the **field** names of the copied dataclass. class_prefix : str The prefix to add to the **class** name of the copied dataclass. - new_default : Dict, optional - The new default values for the copied dataclass. When None, the + new_default : dict, optional + The new default values for the copied dataclass. When None, the default values of the original dataclass are used. Returns @@ -103,40 +97,40 @@ def create_copied_dataclass( for field in original_fields: if get_python_version().minor >= 10: new_field = ( - f"{field_prefix}{field.name}", - field.type, + f"{field_prefix}{field.name}", + field.type, Field( - default=new_default.get(f"{field_prefix}{field.name}", field.default), + default=new_default.get(f"{field_prefix}{field.name}", field.default), default_factory=field.default_factory, init=field.init, repr=field.repr, hash=field.hash, compare=field.compare, metadata=field.metadata, - kw_only=False, # add in py3.10: https://docs.python.org/3/library/dataclasses.html - ) + kw_only=False, # add in py3.10: https://docs.python.org/3/library/dataclasses.html + ), ) else: new_field = ( - f"{field_prefix}{field.name}", - field.type, + f"{field_prefix}{field.name}", + field.type, Field( - default=new_default.get(f"{field_prefix}{field.name}", field.default), + default=new_default.get(f"{field_prefix}{field.name}", field.default), default_factory=field.default_factory, init=field.init, repr=field.repr, hash=field.hash, compare=field.compare, metadata=field.metadata, - ) + ), ) - + new_fields.append(new_field) copied_dataclass = make_dataclass(f"{class_prefix}{original_dataclass.__name__}", new_fields) return copied_dataclass -def remove_dataclass_attr_prefix(data_instance, prefix: str) -> Dict: +def remove_dataclass_attr_prefix(data_instance, prefix: str) -> dict: """Remove the prefix from the attribute names of a dataclass instance. Parameters @@ -147,19 +141,19 @@ def remove_dataclass_attr_prefix(data_instance, prefix: str) -> Dict: Returns ------- - Dict + dict """ new_attributes = {} for field in fields(data_instance): attr_name = field.name attr_value = getattr(data_instance, attr_name) - new_attr_name = f"{attr_name[len(prefix):]}" + new_attr_name = f"{attr_name[len(prefix) :]}" new_attributes[new_attr_name] = attr_value - + return new_attributes -def add_dataclass_attr_prefix(data_instance, prefix: str) -> Dict: +def add_dataclass_attr_prefix(data_instance, prefix: str) -> dict: """Add the prefix to the attribute names of a dataclass instance. 
Parameters @@ -170,7 +164,7 @@ def add_dataclass_attr_prefix(data_instance, prefix: str) -> Dict: Returns ------- - Dict + dict """ new_attributes = {} for field in fields(data_instance): @@ -178,7 +172,7 @@ def add_dataclass_attr_prefix(data_instance, prefix: str) -> Dict: attr_value = getattr(data_instance, attr_name) new_attr_name = f"{prefix}{attr_name}" new_attributes[new_attr_name] = attr_value - + return new_attributes @@ -188,4 +182,4 @@ def print_banner(message: str): logger.info(border) logger.info(f"# {message} #") - logger.info(border) \ No newline at end of file + logger.info(border) diff --git a/src/lmflow/utils/constants.py b/src/lmflow/utils/constants.py index 04ad56965..9e936dc94 100644 --- a/src/lmflow/utils/constants.py +++ b/src/lmflow/utils/constants.py @@ -1,11 +1,10 @@ #!/usr/bin/env python -# coding=utf-8 """ Commonly used constants. """ TEXT_ONLY_DATASET_DESCRIPTION = ( -""" + """ "text_only": a dataset with only raw text instances, with following format: { @@ -21,7 +20,7 @@ TEXT_TO_SCORED_TEXTLIST_DATASET_DESCRIPTION = ( -""" + """ This kind of dataset is commonly used in reward model training/prediction, as well as rl training. { "type": "text_to_scored_textlist", @@ -47,7 +46,7 @@ PAIRED_TEXT_TO_TEXT_DATASET_DESCRIPTION = ( -""" + """ This kind of dataset is commonly used in reward model training as well as rl training. { "type": "paired_text_to_text", @@ -71,7 +70,7 @@ TEXT_ONLY_DATASET_DETAILS = ( -""" + """ For example, ```python @@ -113,7 +112,7 @@ TEXT2TEXT_DATASET_DESCRIPTION = ( -""" + """ "text2text": a dataset with input & output instances, with following format: { @@ -129,7 +128,7 @@ CONVERSATION_DATASET_DESCRIPTION = ( -""" + """ "conversation": a dataset with conversation instances, with following format (`conversation_id`, `system` and `tools` are optional): { @@ -180,7 +179,7 @@ PAIRED_CONVERSATION_DATASET_DESCRIPTION = ( -""" + """ "paired_conversation": a dataset with paired conversation instances, with following format: { @@ -241,7 +240,7 @@ TEXT_TO_TEXTLIST_DATASET_DESCRIPTION = ( -""" + """ This kind of dataset is commonly used in reward model inference. 
{ "type": "text_to_textlist", @@ -267,7 +266,7 @@ TEXT2TEXT_DATASET_DETAILS = ( -""" + """ For example, ```python @@ -321,7 +320,7 @@ FLOAT_ONLY_DATASET_DESCRIPTION = ( -""" + """ "float_only": a dataset with only float instances, with following format: { @@ -336,13 +335,9 @@ ).lstrip("\n") -TEXT_ONLY_DATASET_LONG_DESCRITION = ( - TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS -) +TEXT_ONLY_DATASET_LONG_DESCRITION = TEXT_ONLY_DATASET_DESCRIPTION + TEXT_ONLY_DATASET_DETAILS -TEXT2TEXT_DATASET_LONG_DESCRITION = ( - TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS -) +TEXT2TEXT_DATASET_LONG_DESCRITION = TEXT2TEXT_DATASET_DESCRIPTION + TEXT2TEXT_DATASET_DETAILS DATASET_DESCRIPTION_MAP = { @@ -354,7 +349,7 @@ INSTANCE_FIELDS_MAP = { "text_only": ["text"], "text2text": ["input", "output"], - "conversation": ["messages"], # system, tools and conversation_id are optional + "conversation": ["messages"], # system, tools and conversation_id are optional "paired_conversation": ["chosen", "rejected"], "paired_text_to_text": ["prompt", "chosen", "rejected"], "float_only": ["value"], @@ -368,7 +363,7 @@ "user": "user", "assistant": "assistant", "function": "function", - "observation": "observation" + "observation": "observation", } # LLAVA constants @@ -386,28 +381,26 @@ DEFAULT_IM_END_TOKEN = "" # Lora -# NOTE: This work as a mapping for those models that `peft` library doesn't support yet, and will be +# NOTE: This work as a mapping for those models that `peft` library doesn't support yet, and will be # overwritten by peft.utils.constants.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING # if the model is supported (see hf_model_mixin.py). -# NOTE: When passing lora_target_modules through arg parser, the +# NOTE: When passing lora_target_modules through arg parser, the # value should be a string. Using commas to separate the module names, e.g. -# "--lora_target_modules 'q_proj, v_proj'". +# "--lora_target_modules 'q_proj, v_proj'". # However, when specifying here, they should be lists. 
LMFLOW_LORA_TARGET_MODULES_MAPPING = { - 'qwen2': ["q_proj", "v_proj"], - 'internlm2': ["wqkv"], - 'hymba': ["x_proj.0", "in_proj", "out_proj", "dt_proj.0"] + "qwen2": ["q_proj", "v_proj"], + "internlm2": ["wqkv"], + "hymba": ["x_proj.0", "in_proj", "out_proj", "dt_proj.0"], } # vllm inference MEMORY_SAFE_VLLM_INFERENCE_FINISH_FLAG = "MEMORY_SAFE_VLLM_INFERENCE_DONE" -RETURN_CODE_ERROR_BUFFER = [ - 134 -] +RETURN_CODE_ERROR_BUFFER = [134] # return code 134: -# > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''> +# > Fatal Python error: _enter_buffered_busy: could not acquire lock for <_io.BufferedWriter name=''> # > at interpreter shutdown, possibly due to daemon threads -# The above error, by our observation, is due to the kill signal with unfinished +# The above error, by our observation, is due to the kill signal with unfinished # stdout/stderr writing in the subprocess MEMORY_SAFE_VLLM_INFERENCE_ENV_VAR_TO_REMOVE = [ "OMP_NUM_THREADS", @@ -450,4 +443,4 @@ "TORCHELASTIC_USE_AGENT_STORE", "TORCH_NCCL_ASYNC_ERROR_HANDLING", "TORCHELASTIC_ERROR_FILE", -] \ No newline at end of file +] diff --git a/src/lmflow/utils/conversation_template/__init__.py b/src/lmflow/utils/conversation_template/__init__.py index 1dcf29ea0..001f2e6a7 100644 --- a/src/lmflow/utils/conversation_template/__init__.py +++ b/src/lmflow/utils/conversation_template/__init__.py @@ -1,69 +1,70 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging from lmflow.utils.versioning import is_package_version_at_least -from .base import EMPTY_TEMPLATE, EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, ConversationTemplate, ConversationTemplateForTool +from .base import EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, EMPTY_TEMPLATE, ConversationTemplate, ConversationTemplateForTool from .chatglm import CHATGLM3_TEMPLATE from .chatml import CHATML_TEMPLATE -from .deepseek import ( - DEEPSEEK_V2_TEMPLATE, - DEEPSEEK_V3_TEMPLATE, - DEEPSEEK_R1_TEMPLATE, - DEEPSEEK_R1_DISTILL_TEMPLATE -) -from .gemma import GEMMA_TEMPLATE +from .deepseek import DEEPSEEK_R1_DISTILL_TEMPLATE, DEEPSEEK_R1_TEMPLATE, DEEPSEEK_V2_TEMPLATE, DEEPSEEK_V3_TEMPLATE +from .gemma import GEMMA3_TEMPLATE, GEMMA_TEMPLATE from .hymba import HYMBA_TEMPLATE from .internlm import INTERNLM2_TEMPLATE from .llama import LLAMA2_TEMPLATE, LLAMA3_TEMPLATE, LLAMA3_TEMPLATE_FOR_TOOL from .phi import PHI3_TEMPLATE from .qwen import ( - QWEN2_TEMPLATE, - QWEN2_TEMPLATE_FOR_TOOL, - QWEN2_5_TEMPLATE, QWEN2_5_1M_TEMPLATE, QWEN2_5_MATH_TEMPLATE, - QWEN_QWQ_TEMPLATE + QWEN2_5_TEMPLATE, + QWEN2_TEMPLATE, + QWEN2_TEMPLATE_FOR_TOOL, + QWEN3_TEMPLATE, + QWEN_QWQ_TEMPLATE, ) from .yi import YI1_5_TEMPLATE from .zephyr import ZEPHYR_TEMPLATE +__all__ = [ + "ConversationTemplate", + "ConversationTemplateForTool", +] logger = logging.getLogger(__name__) PRESET_TEMPLATES = { - 'chatglm3': CHATGLM3_TEMPLATE, - 'chatml': CHATML_TEMPLATE, - 'deepseek': DEEPSEEK_V2_TEMPLATE, - 'deepseek_v2': DEEPSEEK_V2_TEMPLATE, - 'disable': EMPTY_TEMPLATE, - 'empty': EMPTY_TEMPLATE, - 'empty_no_special_tokens': EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, - 'gemma': GEMMA_TEMPLATE, - 'hymba': HYMBA_TEMPLATE, - 'internlm2': INTERNLM2_TEMPLATE, - 'llama2': LLAMA2_TEMPLATE, - 'llama3': LLAMA3_TEMPLATE, - 'llama3_for_tool': LLAMA3_TEMPLATE_FOR_TOOL, - 'phi3': PHI3_TEMPLATE, - 'qwen2': QWEN2_TEMPLATE, - 'qwen2_for_tool': QWEN2_TEMPLATE_FOR_TOOL, - 'yi': CHATML_TEMPLATE, - 'yi1_5': YI1_5_TEMPLATE, - 'zephyr': ZEPHYR_TEMPLATE + 
"chatglm3": CHATGLM3_TEMPLATE, + "chatml": CHATML_TEMPLATE, + "deepseek": DEEPSEEK_V2_TEMPLATE, + "deepseek_v2": DEEPSEEK_V2_TEMPLATE, + "disable": EMPTY_TEMPLATE, + "empty": EMPTY_TEMPLATE, + "empty_no_special_tokens": EMPTY_NO_SPECIAL_TOKENS_TEMPLATE, + "gemma": GEMMA_TEMPLATE, + "hymba": HYMBA_TEMPLATE, + "internlm2": INTERNLM2_TEMPLATE, + "llama2": LLAMA2_TEMPLATE, + "llama3": LLAMA3_TEMPLATE, + "llama3_for_tool": LLAMA3_TEMPLATE_FOR_TOOL, + "phi3": PHI3_TEMPLATE, + "qwen2": QWEN2_TEMPLATE, + "qwen2_for_tool": QWEN2_TEMPLATE_FOR_TOOL, + "yi": CHATML_TEMPLATE, + "yi1_5": YI1_5_TEMPLATE, + "zephyr": ZEPHYR_TEMPLATE, } JINJA_TEMPLATES = { - 'deepseek_r1': DEEPSEEK_R1_TEMPLATE, - 'deepseek_r1_distill': DEEPSEEK_R1_DISTILL_TEMPLATE, - 'deepseek_v3': DEEPSEEK_V3_TEMPLATE, - 'qwen2_5': QWEN2_5_TEMPLATE, - 'qwen2_5_1m': QWEN2_5_1M_TEMPLATE, - 'qwen2_5_math': QWEN2_5_MATH_TEMPLATE, - 'qwen_qwq': QWEN_QWQ_TEMPLATE, + "deepseek_r1": DEEPSEEK_R1_TEMPLATE, + "deepseek_r1_distill": DEEPSEEK_R1_DISTILL_TEMPLATE, + "deepseek_v3": DEEPSEEK_V3_TEMPLATE, + "qwen2_5": QWEN2_5_TEMPLATE, + "qwen2_5_1m": QWEN2_5_1M_TEMPLATE, + "qwen2_5_math": QWEN2_5_MATH_TEMPLATE, + "qwen_qwq": QWEN_QWQ_TEMPLATE, + "qwen3": QWEN3_TEMPLATE, + "gemma3": GEMMA3_TEMPLATE, } if is_package_version_at_least("transformers", "4.43.0"): @@ -73,4 +74,4 @@ logger.warning( f"The following conversation templates require transformers>=4.43.0: {JINJA_TEMPLATES.keys()}. " f"Please upgrade `transformers` to use them." - ) \ No newline at end of file + ) diff --git a/src/lmflow/utils/conversation_template/base.py b/src/lmflow/utils/conversation_template/base.py index ecc129171..ceafac708 100644 --- a/src/lmflow/utils/conversation_template/base.py +++ b/src/lmflow/utils/conversation_template/base.py @@ -1,11 +1,11 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. +import logging import re from abc import ABC, abstractmethod +from collections.abc import Sequence from dataclasses import dataclass, field -from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple -import logging +from typing import Literal, Optional, Union from transformers import PreTrainedTokenizer @@ -21,71 +21,74 @@ class TemplateComponent: Parameters ---------- type : Literal['token', 'token_id', 'string', 'tools'] - - Type of the component. - - - When the component is a token or a string, the content should be `string`. - The difference between the two is that token will be converted to token ids - by the tokenizer.convert_tokens_to_ids() method, while string will be directly + - Type of the component. + + - When the component is a token or a string, the content should be `string`. + The difference between the two is that token will be converted to token ids + by the tokenizer.convert_tokens_to_ids() method, while string will be directly encoded by the tokenizer.encode() method. Specially, since the bos token and eos token are frequently used across different templates, we provide the convenience to use `'bos_token'` and `'eos_token'` to represent the actual bos and eos tokens when - `type` of the `TemplateComponent` is `token`. For example: - + `type` of the `TemplateComponent` is `token`. For example: + ```python TemplateComponent(type='token', content='bos_token') ``` - + After encoding, the content will be replaced by the actual token id of the bos token. 
- Please do remember that if you set the `type` to `string`, the tokenizer will try to + Please do remember that if you set the `type` to `string`, the tokenizer will try to encode the string 'bos_token' instead of providing the actual bos token. - - - When the component is token_id, the content should be `int` or `List[int]`, and + + - When the component is token_id, the content should be `int` or `list[int]`, and will be directly appended to the encoded token ids. - + - Tools are not supported yet. - - content : Union[str, int, List[str], List[int]] + + content : Union[str, int, list[str], list[int]] Content of the component. """ - type: Literal['token', 'token_id', 'string', 'tools'] - content: Union[str, int, List[str], List[int]] - mask: Optional[bool] = True # for token specific masking, work in progress - + + type: Literal["token", "token_id", "string", "tools"] + content: Union[str, int, list[str], list[int]] + mask: Optional[bool] = True # for token specific masking, work in progress + def __post_init__(self): assert self.content, "Content of the component cannot be empty." - - if self.type == 'tools': + + if self.type == "tools": assert isinstance(self.content, list), ( - f"Content of tools component must be a list, got {type(self.content)}") - elif self.type in ['token', 'string']: + f"Content of tools component must be a list, got {type(self.content)}" + ) + elif self.type in ["token", "string"]: assert isinstance(self.content, str), ( - f"Content of string/token component must be a string, got {type(self.content)}") - elif self.type == 'token_id': + f"Content of string/token component must be a string, got {type(self.content)}" + ) + elif self.type == "token_id": assert isinstance(self.content, int) or all(isinstance(token_id, int) for token_id in self.content), ( - f"Content of token_id component must be an integer or a list of integers.") + "Content of token_id component must be an integer or a list of integers." + ) else: - raise ValueError(f"The type of the component must be either " - f"'token', 'string' or 'tools', got {self.type}") - + raise ValueError(f"The type of the component must be either 'token', 'string' or 'tools', got {self.type}") + def __repr__(self) -> str: return f"TemplateComponent(type={self.type}, content={self.content})".replace("\n", "\\n") - + def __str__(self) -> str: return f"{self.content}".replace("\n", "\\n") @dataclass class Formatter(ABC): - template: List[TemplateComponent] = field(default_factory=list) - + template: list[TemplateComponent] = field(default_factory=list) + @abstractmethod - def format(self, **kwargs) -> List[TemplateComponent]: ... - + def format(self, **kwargs) -> list[TemplateComponent]: ... + def has_placeholder(self): flag = False for component in self.template: - if component.type == 'string': + if component.type == "string": if re.search(r"{{(.*?)}}", component.content): flag = True break @@ -97,7 +100,7 @@ class EmptyFormatter(Formatter): def __post_init__(self): if self.has_placeholder(): raise ValueError("Empty formatter should not have placeholders.") - + def format(self, **kwargs) -> list: """Empty formatter for when no formatting is needed. This is useful when user has already applied formatting to the dataset. @@ -108,16 +111,16 @@ def format(self, **kwargs) -> list: Original template. 
""" return self.template - + @dataclass class StringFormatter(Formatter): def __post_init__(self): if not self.has_placeholder(): raise ValueError("String formatter should have placeholders.") - + def format(self, **kwargs) -> list: - """Format the string components with the provided keyword arguments. + """Format the string components with the provided keyword arguments. Mostly used for formatting system prompt, user and assistant messages. Parameters @@ -132,17 +135,19 @@ def format(self, **kwargs) -> list: """ formatted_template = [] for component in self.template: - if component.type == 'string': + if component.type == "string": for key, value in kwargs.items(): templated = component.content.replace("{{" + key + "}}", value) if len(templated) == 0: - logger.warning("Found empty string after formatting, adding a space instead. " - "If this is not intended, please check the dataset.") + logger.warning( + "Found empty string after formatting, adding a space instead. " + "If this is not intended, please check the dataset." + ) templated = " " - formatted_template.append(TemplateComponent(type='string', content=templated)) + formatted_template.append(TemplateComponent(type="string", content=templated)) else: formatted_template.append(component) - + logger.debug(formatted_template) return formatted_template @@ -162,27 +167,29 @@ class ConversationTemplate: special_stopper: Optional[TemplateComponent] = None template_name: Optional[str] = None system_default: Optional[str] = None - + def __post_init__(self): if self.separator: - if self.separator.type not in ['string', 'token', 'token_id']: + if self.separator.type not in ["string", "token", "token_id"]: raise NotImplementedError(f"Component type {self.separator.type} cannot be used as a separator.") - + if self.special_starter: - if self.special_starter.type not in ['string', 'token', 'token_id']: - raise NotImplementedError(f"Component type {self.special_starter.type} cannot be used as a special starter.") - + if self.special_starter.type not in ["string", "token", "token_id"]: + raise NotImplementedError( + f"Component type {self.special_starter.type} cannot be used as a special starter." + ) + def encode_conversation( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, - tools: Optional[List[str]] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: - r''' + tools: Optional[list[str]] = None, + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: + r""" Messages here should be guaranteed to be in pairs, with the first message being the user message and the second message being the system message. - Data example: + Data example: ```json { "conversation_id": 2, @@ -200,200 +207,193 @@ def encode_conversation( ] } ``` - ''' + """ assert isinstance(messages, list), "Messages must be a list." - + if tools: logger.warning("Tools are not supported yet. Please include tools in the system message manually.") - + if system: - if system.replace(" ",""): + if system.replace(" ", ""): if not self.system_formatter: - raise ValueError("Your dataset contains system message but no system formatter is provided. " - "Consider either providing a system formatter or removing system prompt from your dataset.") + raise ValueError( + "Your dataset contains system message but no system formatter is provided. " + "Consider either providing a system formatter or removing system prompt from your dataset." 
+ ) else: system = self.system_default if self.system_default else None - + encoded_pairs = self._encode(tokenizer, messages, system, tools, **kwargs) encoded_pairs = self.post_process_pairs(encoded_pairs=encoded_pairs, tokenizer=tokenizer) - + return encoded_pairs - + def _encode( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: # TODO: truncation according to model max length # TODO: make sure the last few tokens are "learnable", not masked with token_id = -100. - + res_all = [] - + if system: system_formatted = self.system_formatter.format(content=system) else: - system_formatted = self.system_formatter.format(content='') if self.force_system else [] + system_formatted = self.system_formatter.format(content="") if self.force_system else [] system_encoded = self._encode_template(system_formatted, tokenizer) - + for i in range(0, len(messages), 2): user_message = messages[i] assistant_message = messages[i + 1] - + user_formatted = self.user_formatter.format(content=user_message["content"]) assistant_formatted = self.assistant_formatter.format(content=assistant_message["content"]) - + user_encoded = self._encode_template(user_formatted, tokenizer) assistant_encoded = self._encode_template(assistant_formatted, tokenizer) - - res_all.append(( - system_encoded + user_encoded if i == 0 else user_encoded, - assistant_encoded - )) - + + res_all.append((system_encoded + user_encoded if i == 0 else user_encoded, assistant_encoded)) + return res_all - + def _encode_template( - self, - template: List[TemplateComponent], - tokenizer: PreTrainedTokenizer, - **kwargs - ) -> List[int]: + self, template: list[TemplateComponent], tokenizer: PreTrainedTokenizer, **kwargs + ) -> list[int]: """Encode template components into token ids. Parameters ---------- - template : List[TemplateComponent] + template : list[TemplateComponent] Formatted template components. tokenizer : PreTrainedTokenizer Tokenizer to convert tokens into token ids. Returns ------- - List[int] + list[int] Encoded token ids. """ encoded_ids = [] for component in template: - if component.type == 'string': + if component.type == "string": if len(component.content) == 0: logger.warning("Empty string component found in the template.") continue else: encoded_ids += tokenizer.encode(component.content, add_special_tokens=False) - elif component.type == 'token': - if component.content == 'bos_token': + elif component.type == "token": + if component.content == "bos_token": encoded_ids += [tokenizer.bos_token_id] - elif component.content == 'eos_token': + elif component.content == "eos_token": encoded_ids += [tokenizer.eos_token_id] else: encoded_ids += self._ensure_id_list(tokenizer.convert_tokens_to_ids(component.content)) - elif component.type == 'token_id': + elif component.type == "token_id": encoded_ids += self._ensure_id_list(component.content) else: raise NotImplementedError(f"Component type {component.type} is not supported yet.") return encoded_ids - + def post_process_pairs(self, encoded_pairs, tokenizer): if self.separator and self.remove_last_sep: - # For models that require a separator between messages, + # For models that require a separator between messages, # user can include the seperator at the end of each template - # and specify the separator. 
Auto formatting will remove the + # and specify the separator. Auto formatting will remove the # last separator once user specifies this option. encoded_pairs = self.remove_last_separator(encoded_pairs, tokenizer) - + if self.special_starter: - # For models that has ONLY ONE bos token at the beginning of + # For models that has ONLY ONE bos token at the beginning of # a conversation session (not a conversation pair), user can # specify a special starter to add that starter to the very - # beginning of the conversation session. + # beginning of the conversation session. # eg: - # llama-2: and at every pair of conversation + # llama-2: and at every pair of conversation # v.s. # llama-3: <|begin_of_text|> only at the beginning of a session encoded_pairs = self.add_special_starter(encoded_pairs, tokenizer) - + if self.special_stopper: encoded_pairs = self.add_special_stopper(encoded_pairs, tokenizer) - + return encoded_pairs - + def remove_last_separator( - self, - encoded_pairs: Sequence[Tuple[List[int], List[int]]], - tokenizer: PreTrainedTokenizer - ) -> Sequence[Tuple[List[int], List[int]]]: + self, encoded_pairs: Sequence[tuple[list[int], list[int]]], tokenizer: PreTrainedTokenizer + ) -> Sequence[tuple[list[int], list[int]]]: last_assistant_msg = encoded_pairs[-1][1] - if self.separator.type == 'string': + if self.separator.type == "string": separator_ids = tokenizer.encode(self.separator.content, add_special_tokens=False) - elif self.separator.type == 'token': + elif self.separator.type == "token": separator_ids = self._ensure_id_list(tokenizer.convert_tokens_to_ids(self.separator.content)) - elif self.separator.type == 'token_id': + elif self.separator.type == "token_id": separator_ids = self._ensure_id_list(self.separator.content) else: raise ValueError(f"Component type {self.separator.type} cannot be used as a separator.") - + if len(separator_ids) > len(last_assistant_msg): raise ValueError("Separator is longer than the last assistant message, please check.") - - if last_assistant_msg[-len(separator_ids):] == separator_ids: - last_assistant_msg = last_assistant_msg[:-len(separator_ids)] - + + if last_assistant_msg[-len(separator_ids) :] == separator_ids: + last_assistant_msg = last_assistant_msg[: -len(separator_ids)] + encoded_pairs[-1] = (encoded_pairs[-1][0], last_assistant_msg) - + return encoded_pairs - + def add_special_starter( - self, - encoded_pairs: Sequence[Tuple[List[int], List[int]]], - tokenizer: PreTrainedTokenizer - ) -> Sequence[Tuple[List[int], List[int]]]: - if self.special_starter.type == 'string': + self, encoded_pairs: Sequence[tuple[list[int], list[int]]], tokenizer: PreTrainedTokenizer + ) -> Sequence[tuple[list[int], list[int]]]: + if self.special_starter.type == "string": special_starter_ids = tokenizer.encode(self.special_starter.content, add_special_tokens=False) - elif self.special_starter.type == 'token': - if self.special_starter.content == 'bos_token': + elif self.special_starter.type == "token": + if self.special_starter.content == "bos_token": special_starter_ids = [tokenizer.bos_token_id] - elif self.special_starter.content == 'eos_token': + elif self.special_starter.content == "eos_token": special_starter_ids = [tokenizer.eos_token_id] else: - special_starter_ids = self._ensure_id_list(tokenizer.convert_tokens_to_ids(self.special_starter.content)) - elif self.special_starter.type == 'token_id': + special_starter_ids = self._ensure_id_list( + tokenizer.convert_tokens_to_ids(self.special_starter.content) + ) + elif self.special_starter.type == 
"token_id": special_starter_ids = self._ensure_id_list(self.special_starter.content) else: raise ValueError(f"Component type {self.special_starter.type} cannot be used as a special starter.") - + encoded_pairs[0] = (special_starter_ids + encoded_pairs[0][0], encoded_pairs[0][1]) - + return encoded_pairs - + def add_special_stopper( - self, - encoded_pairs: Sequence[Tuple[List[int], List[int]]], - tokenizer: PreTrainedTokenizer - ) -> Sequence[Tuple[List[int], List[int]]]: - if self.special_stopper.type == 'string': + self, encoded_pairs: Sequence[tuple[list[int], list[int]]], tokenizer: PreTrainedTokenizer + ) -> Sequence[tuple[list[int], list[int]]]: + if self.special_stopper.type == "string": special_stopper_ids = tokenizer.encode(self.special_stopper.content, add_special_tokens=False) - elif self.special_stopper.type == 'token': - if self.special_stopper.content == 'bos_token': + elif self.special_stopper.type == "token": + if self.special_stopper.content == "bos_token": special_stopper_ids = [tokenizer.bos_token_id] - elif self.special_stopper.content == 'eos_token': + elif self.special_stopper.content == "eos_token": special_stopper_ids = [tokenizer.eos_token_id] else: - special_stopper_ids = self._ensure_id_list(tokenizer.convert_tokens_to_ids(self.special_stopper.content)) - elif self.special_stopper.type == 'token_id': + special_stopper_ids = self._ensure_id_list( + tokenizer.convert_tokens_to_ids(self.special_stopper.content) + ) + elif self.special_stopper.type == "token_id": special_stopper_ids = self._ensure_id_list(self.special_stopper.content) else: raise ValueError(f"Component type {self.special_stopper.type} cannot be used as a special stopper.") - + encoded_pairs[-1] = (encoded_pairs[-1][0], encoded_pairs[-1][1] + special_stopper_ids) - + return encoded_pairs - - def _ensure_id_list(self, obj: Union[int, List[int]]) -> List[int]: - '''Make sure the object is a list of integers. Useful for handling token ids. - ''' + + def _ensure_id_list(self, obj: Union[int, list[int]]) -> list[int]: + """Make sure the object is a list of integers. Useful for handling token ids.""" if isinstance(obj, int): return [obj] elif isinstance(obj, list): @@ -401,19 +401,20 @@ def _ensure_id_list(self, obj: Union[int, List[int]]) -> List[int]: else: raise ValueError(f"Object type {type(obj)} is not supported yet.") + @dataclass class ConversationTemplateForTool(ConversationTemplate): def encode_conversation( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, - tools: Optional[List[str]] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: - r''' + tools: Optional[list[str]] = None, + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: + r""" Messages here should be guaranteed to be in pairs, with the first message being the user message and the second message being the system message. - Data example: + Data example: ```json { "conversation_id": 2, @@ -431,64 +432,66 @@ def encode_conversation( ] } ``` - ''' + """ assert isinstance(messages, list), "Messages must be a list." - + tools = self._handle_tools(tools) - + if system is None: system = "" else: - if system.replace(" ",""): + if system.replace(" ", ""): if not self.system_formatter: - raise ValueError("Your dataset contains system message but no system formatter is provided. 
" - "Consider either providing a system formatter or removing system prompt from your dataset.") + raise ValueError( + "Your dataset contains system message but no system formatter is provided. " + "Consider either providing a system formatter or removing system prompt from your dataset." + ) else: system = self.system_default if self.system_default else "" encoded_pairs = self._encode(tokenizer, messages, system, tools, **kwargs) encoded_pairs = self.post_process_pairs(encoded_pairs=encoded_pairs, tokenizer=tokenizer) - + return encoded_pairs - + def _encode( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: # TODO: truncation according to model max length # TODO: make sure the last few tokens are "learnable", not masked with token_id = -100. - + res_all = [] # Concatenate the system and tools strings system = system + tools if system: system_formatted = self.system_formatter.format(content=system) else: - system_formatted = self.system_formatter.format(content='') if self.force_system else [] + system_formatted = self.system_formatter.format(content="") if self.force_system else [] system_encoded = self._encode_template(system_formatted, tokenizer) ls_for_save = [] for i in range(0, len(messages), 1): - if messages[i]['role'] == CONVERSATION_ROLE_NAMES['user']: + if messages[i]["role"] == CONVERSATION_ROLE_NAMES["user"]: user_message = messages[i] user_formatted = self.user_formatter.format(content=user_message["content"]) user_encoded = self._encode_template(user_formatted, tokenizer) if i == 0: user_encoded = system_encoded + user_encoded ls_for_save.append(user_encoded) - elif messages[i]['role'] == CONVERSATION_ROLE_NAMES['function']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["function"]: function_message = messages[i] - function_formatted = self.function_formatter.format(content=function_message['content']) + function_formatted = self.function_formatter.format(content=function_message["content"]) function_encoded = self._encode_template(function_formatted, tokenizer) ls_for_save.append(function_encoded) - elif messages[i]['role'] == CONVERSATION_ROLE_NAMES['observation']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["observation"]: observation_message = messages[i] - observation_formatted = self.observation_formatter.format(content=observation_message['content']) + observation_formatted = self.observation_formatter.format(content=observation_message["content"]) observation_encoded = self._encode_template(observation_formatted, tokenizer) ls_for_save.append(observation_encoded) - elif messages[i]['role'] == CONVERSATION_ROLE_NAMES['assistant']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["assistant"]: assistant_message = messages[i] assistant_formatted = self.assistant_formatter.format(content=assistant_message["content"]) assistant_encoded = self._encode_template(assistant_formatted, tokenizer) @@ -496,85 +499,74 @@ def _encode( # res_tuple = (ls_for_save[0], ls_for_save[1], ls_for_save[2], ls_for_save[3]) res_all.append(tuple(ls_for_save)) ls_for_save = [] - + if ls_for_save: res_all.append(tuple(ls_for_save)) - + return res_all - + def _encode_template( - self, - template: List[TemplateComponent], - tokenizer: PreTrainedTokenizer, - **kwargs - ) -> List[int]: + self, template: list[TemplateComponent], tokenizer: 
PreTrainedTokenizer, **kwargs + ) -> list[int]: """Encode template components into token ids. Parameters ---------- - template : List[TemplateComponent] + template : list[TemplateComponent] Formatted template components. tokenizer : PreTrainedTokenizer Tokenizer to convert tokens into token ids. Returns ------- - List[int] + list[int] Encoded token ids. """ encoded_ids = [] for component in template: - if component.type == 'string': + if component.type == "string": if len(component.content) == 0: logger.warning("Empty string component found in the template.") continue else: encoded_ids += tokenizer.encode(component.content, add_special_tokens=False) - elif component.type == 'token': - if component.content == 'bos_token': + elif component.type == "token": + if component.content == "bos_token": encoded_ids += [tokenizer.bos_token_id] - elif component.content == 'eos_token': + elif component.content == "eos_token": encoded_ids += [tokenizer.eos_token_id] else: encoded_ids += self._ensure_id_list(tokenizer.convert_tokens_to_ids(component.content)) - elif component.type == 'token_id': + elif component.type == "token_id": encoded_ids += self._ensure_id_list(component.content) else: raise NotImplementedError(f"Component type {component.type} is not supported yet.") return encoded_ids - - def _handle_tools(self, tools: Optional[List[str]]) -> str: - tools_out = ','.join(tools) if tools is not None else '' + + def _handle_tools(self, tools: Optional[list[str]]) -> str: + tools_out = ",".join(tools) if tools is not None else "" return tools_out EMPTY_TEMPLATE = ConversationTemplate( - template_name='empty', + template_name="empty", user_formatter=StringFormatter( template=[ - TemplateComponent(type='token', content='bos_token'), - TemplateComponent(type='string', content='{{content}}') + TemplateComponent(type="token", content="bos_token"), + TemplateComponent(type="string", content="{{content}}"), ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] - ) + ), ) EMPTY_NO_SPECIAL_TOKENS_TEMPLATE = ConversationTemplate( - template_name='empty_no_special_tokens', - user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='{{content}}') - ] - ), - assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='{{content}}') - ] - ) -) \ No newline at end of file + template_name="empty_no_special_tokens", + user_formatter=StringFormatter(template=[TemplateComponent(type="string", content="{{content}}")]), + assistant_formatter=StringFormatter(template=[TemplateComponent(type="string", content="{{content}}")]), +) diff --git a/src/lmflow/utils/conversation_template/chatglm.py b/src/lmflow/utils/conversation_template/chatglm.py index 2834b1ce6..12f28ab85 100644 --- a/src/lmflow/utils/conversation_template/chatglm.py +++ b/src/lmflow/utils/conversation_template/chatglm.py @@ -1,25 +1,13 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
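For illustration, outside the diff itself: `encode_conversation` above maps a role-tagged message list to per-round `(prompt_ids, response_ids)` tuples that the finetuner later concatenates and masks. A minimal sketch using the `EMPTY_TEMPLATE` defined in `base.py`; the import path and the choice of `gpt2` as a stand-in tokenizer are assumptions.

```python
# Sketch only: exercise EMPTY_TEMPLATE from base.py with an arbitrary HF tokenizer.
from transformers import AutoTokenizer

from lmflow.utils.conversation_template.base import EMPTY_TEMPLATE  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer with bos/eos tokens

messages = [
    {"role": "user", "content": "What does LMFlow do?"},
    {"role": "assistant", "content": "It finetunes large language models."},
]

# One (user_ids, assistant_ids) tuple per user/assistant round; in SFT the user ids
# are typically label-masked and the loss is computed on the assistant ids.
encoded_pairs = EMPTY_TEMPLATE.encode_conversation(tokenizer=tokenizer, messages=messages)
for prompt_ids, response_ids in encoded_pairs:
    print(len(prompt_ids), len(response_ids))
```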
-from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent CHATGLM3_TEMPLATE = ConversationTemplate( - template_name='chatglm3', - user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|user|>\n {{content}}') - ] - ), + template_name="chatglm3", + user_formatter=StringFormatter(template=[TemplateComponent(type="string", content="<|user|>\n {{content}}")]), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|assistant|>\n {{content}}') - ] - ), - system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|system|>\n {{content}}') - ] + template=[TemplateComponent(type="string", content="<|assistant|>\n {{content}}")] ), - special_starter=TemplateComponent(type='string', content='[gMASK]sop') -) \ No newline at end of file + system_formatter=StringFormatter(template=[TemplateComponent(type="string", content="<|system|>\n {{content}}")]), + special_starter=TemplateComponent(type="string", content="[gMASK]sop"), +) diff --git a/src/lmflow/utils/conversation_template/chatml.py b/src/lmflow/utils/conversation_template/chatml.py index 60b440fba..e628a74c6 100644 --- a/src/lmflow/utils/conversation_template/chatml.py +++ b/src/lmflow/utils/conversation_template/chatml.py @@ -1,24 +1,16 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. -from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent CHATML_TEMPLATE = ConversationTemplate( - template_name='chatml', + template_name="chatml", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n') - ] - ) -) \ No newline at end of file + template=[TemplateComponent(type="string", content="<|im_start|>system\n{{content}}<|im_end|>\n")] + ), +) diff --git a/src/lmflow/utils/conversation_template/deepseek.py b/src/lmflow/utils/conversation_template/deepseek.py index 29a02801e..103d51005 100644 --- a/src/lmflow/utils/conversation_template/deepseek.py +++ b/src/lmflow/utils/conversation_template/deepseek.py @@ -1,28 +1,18 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
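As a side illustration of how the per-model template files expand their turns (a sketch, not part of the diff): each `StringFormatter` substitutes `{{content}}` into its string components, so the ChatML template above wraps every turn in `<|im_start|>role ... <|im_end|>` markers. The import path is assumed.

```python
# Sketch: reproduce what CHATML_TEMPLATE's user formatter emits for one turn.
from lmflow.utils.conversation_template.base import StringFormatter, TemplateComponent

user_formatter = StringFormatter(
    template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")]
)

# format() fills {{content}} and returns the component list; joining the string
# components gives the literal text that is later tokenized.
filled = user_formatter.format(content="Hello!")
print("".join(c.content for c in filled if c.type == "string"))
# -> <|im_start|>user\nHello!<|im_end|>\n
```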
-from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent DEEPSEEK_V2_TEMPLATE = ConversationTemplate( - template_name='deepseek_v2', - user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='User: {{content}}\n\n') - ] - ), + template_name="deepseek_v2", + user_formatter=StringFormatter(template=[TemplateComponent(type="string", content="User: {{content}}\n\n")]), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='Assistant: {{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="Assistant: {{content}}"), + TemplateComponent(type="token", content="eos_token"), ] ), - system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='{{content}}\n\n') - ] - ), - special_starter=TemplateComponent(type='token', content='bos_token') + system_formatter=StringFormatter(template=[TemplateComponent(type="string", content="{{content}}\n\n")]), + special_starter=TemplateComponent(type="token", content="bos_token"), ) @@ -249,6 +239,6 @@ "{{'<|tool▁outputs▁end|>'}}" "{% endif %}" "{% if add_generation_prompt and not ns.is_tool %}" - "{{'<|Assistant|>'}}" + "{{'<|Assistant|>\\n'}}" "{% endif %}" -) \ No newline at end of file +) diff --git a/src/lmflow/utils/conversation_template/gemma.py b/src/lmflow/utils/conversation_template/gemma.py index adaeff240..504923c8e 100644 --- a/src/lmflow/utils/conversation_template/gemma.py +++ b/src/lmflow/utils/conversation_template/gemma.py @@ -1,11 +1,9 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging from dataclasses import dataclass -from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent logger = logging.getLogger(__name__) @@ -13,33 +11,85 @@ @dataclass class GemmaConversationTemplate(ConversationTemplate): def encode_conversation(self, *args, **kwargs): - if kwargs.get('system'): + if kwargs.get("system"): logger.warning( - 'As of now, Gemma does not support system messages officially. ' - 'ConversationTemplate will add your system messages right after ' - 'the bos token and before the user message without any special formatting. ' - 'For more details, please refer to the [official template]' - '(https://huggingface.co/google/gemma-1.1-2b-it/blob/bf4924f313df5166dee1467161e886e55f2eb4d4/tokenizer_config.json#L1507).' + "As of now, Gemma does not support system messages officially. " + "ConversationTemplate will add your system messages right after " + "the bos token and before the user message without any special formatting. " + "For more details, please refer to the [official template]" + "(https://huggingface.co/google/gemma-1.1-2b-it/blob/bf4924f313df5166dee1467161e886e55f2eb4d4/tokenizer_config.json#L1507)." 
) return super().encode_conversation(*args, **kwargs) - + GEMMA_TEMPLATE = GemmaConversationTemplate( - template_name='gemma', + template_name="gemma", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='user\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="user\n{{content}}\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='model\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="model\n{{content}}\n")] ), - system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='{{content}}') - ] - ), - special_starter=TemplateComponent(type='token', content='bos_token') -) \ No newline at end of file + system_formatter=StringFormatter(template=[TemplateComponent(type="string", content="{{content}}")]), + special_starter=TemplateComponent(type="token", content="bos_token"), +) + +GEMMA3_TEMPLATE = """{{ bos_token }} +{%- if messages[0]['role'] == 'system' -%} + {%- if messages[0]['content'] is string -%} + {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%} + {%- else -%} + {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%} + {%- endif -%} + {%- set loop_messages = messages[1:] -%}\n{%- else -%} + {%- set first_user_prefix = \"\" -%} + {%- set loop_messages = messages -%} +{%- endif -%} +{%- for message in loop_messages -%} + {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} + {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }} + {%- endif -%} + {%- if (message['role'] == 'assistant') -%} + {%- set role = \"model\" -%} + {%- else -%} + {%- set role = message['role'] -%} + {%- endif -%} + {{ '' + role + '\n' + (first_user_prefix if loop.first else \"\") }} + {%- if (message['role'] == 'assistant') -%} + {% generation %} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception(\"Invalid content type\") }} + {%- endif -%} + {{ '\n' }} + {% endgeneration %} + {%- else -%} + {%- if message['content'] is string -%} + {{ message['content'] | trim }} + {%- elif message['content'] is iterable -%} + {%- for item in message['content'] -%} + {%- if item['type'] == 'image' -%} + {{ '' }} + {%- elif item['type'] == 'text' -%} + {{ item['text'] | trim }} + {%- endif -%} + {%- endfor -%} + {%- else -%} + {{ raise_exception(\"Invalid content type\") }} + {%- endif -%} + {{ '\n' }} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{'model\n'}} +{%- endif -%} +""" diff --git a/src/lmflow/utils/conversation_template/hymba.py b/src/lmflow/utils/conversation_template/hymba.py index cb8a827bf..ba9e5ca7d 100644 --- a/src/lmflow/utils/conversation_template/hymba.py +++ b/src/lmflow/utils/conversation_template/hymba.py @@ -1,10 +1,8 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
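Unlike the dataclass-based templates, `GEMMA3_TEMPLATE` above is a raw Jinja chat template; its `{% generation %} ... {% endgeneration %}` blocks mark assistant spans so `transformers` can return an assistant-token mask. A usage sketch; the model id, the import path, and the availability of `return_assistant_tokens_mask` in the installed `transformers` version are assumptions.

```python
# Sketch: render the Jinja template with Hugging Face's chat-template machinery.
from transformers import AutoTokenizer

from lmflow.utils.conversation_template.gemma import GEMMA3_TEMPLATE  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")  # placeholder model id
messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello! How can I help?"},
]

encoded = tokenizer.apply_chat_template(
    messages,
    chat_template=GEMMA3_TEMPLATE,
    tokenize=True,
    return_dict=True,
    return_assistant_tokens_mask=True,
)
# encoded["assistant_masks"][i] == 1 only for tokens emitted inside {% generation %}
# blocks, i.e. the positions that should contribute to the SFT loss.
```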
-from .base import StringFormatter, TemplateComponent, ConversationTemplateForTool -from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple +from typing import Optional -from transformers import PreTrainedTokenizer +from .base import ConversationTemplateForTool, StringFormatter, TemplateComponent # NOTE: 'contexts' are not used in sft # {{'System'}} @@ -37,8 +35,8 @@ class HymbaConversationTemplate(ConversationTemplateForTool): - def _handle_tools(self, tools: Optional[List[str]]) -> str: - tools_out = '' + def _handle_tools(self, tools: Optional[list[str]]) -> str: + tools_out = "" if tools is not None: for tool in tools: tools_out += "\n " + tool + " " @@ -46,34 +44,24 @@ def _handle_tools(self, tools: Optional[List[str]]) -> str: HYMBA_TEMPLATE = HymbaConversationTemplate( - template_name='hymba', + template_name="hymba", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='User\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="User\n{{content}}\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='Assistant\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="Assistant\n{{content}}\n")] ), function_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='Assistant\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="Assistant\n{{content}}\n")] ), observation_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='Tool\n{{content}}\n') - ] + template=[TemplateComponent(type="string", content="Tool\n{{content}}\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='System{{content}}\n\n') - ] + template=[TemplateComponent(type="string", content="System{{content}}\n\n")] ), - separator=TemplateComponent(type='token_id', content=13), + separator=TemplateComponent(type="token_id", content=13), remove_last_sep=True, - special_stopper=TemplateComponent(type='token', content='eos_token'), - force_system=True -) \ No newline at end of file + special_stopper=TemplateComponent(type="token", content="eos_token"), + force_system=True, +) diff --git a/src/lmflow/utils/conversation_template/internlm.py b/src/lmflow/utils/conversation_template/internlm.py index 3e55f754a..99642e611 100644 --- a/src/lmflow/utils/conversation_template/internlm.py +++ b/src/lmflow/utils/conversation_template/internlm.py @@ -1,25 +1,17 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
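The Hymba template above is a `ConversationTemplateForTool`, so beyond `user`/`assistant` turns it also formats the model's tool calls and the tool results, and folds a `tools` list into the system prompt via `_handle_tools`. A hedged sketch of the expected call; the exact role strings come from `CONVERSATION_ROLE_NAMES` in `lmflow.utils.constants`, and `function`/`observation` below stand in for the tool-call and tool-result roles.

```python
# Sketch: encode a tool-use conversation with a ConversationTemplateForTool subclass.
# Import path, model id, and the literal role strings are assumptions.
from transformers import AutoTokenizer

from lmflow.utils.conversation_template.hymba import HYMBA_TEMPLATE

tokenizer = AutoTokenizer.from_pretrained("nvidia/Hymba-1.5B-Instruct")  # placeholder id

messages = [
    {"role": "user", "content": "What's the weather in Hong Kong?"},
    {"role": "function", "content": '{"name": "get_weather", "arguments": {"city": "Hong Kong"}}'},
    {"role": "observation", "content": '{"temperature_c": 28}'},
    {"role": "assistant", "content": "It is about 28 degrees Celsius in Hong Kong."},
]
tools = ['{"name": "get_weather", "parameters": {"city": "string"}}']  # JSON strings joined by _handle_tools

encoded = HYMBA_TEMPLATE.encode_conversation(tokenizer=tokenizer, messages=messages, tools=tools)
```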
-from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent INTERNLM2_TEMPLATE = ConversationTemplate( - template_name='internlm2', + template_name="internlm2", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>system\n{{content}}<|im_end|>\n")] ), - special_starter=TemplateComponent(type='token', content='bos_token') -) \ No newline at end of file + special_starter=TemplateComponent(type="token", content="bos_token"), +) diff --git a/src/lmflow/utils/conversation_template/llama.py b/src/lmflow/utils/conversation_template/llama.py index 5cc147cfe..f4cf11090 100644 --- a/src/lmflow/utils/conversation_template/llama.py +++ b/src/lmflow/utils/conversation_template/llama.py @@ -1,91 +1,96 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple +from collections.abc import Sequence +from typing import Optional from transformers import PreTrainedTokenizer -from .base import StringFormatter, TemplateComponent, ConversationTemplate, ConversationTemplateForTool - from lmflow.utils.constants import CONVERSATION_ROLE_NAMES +from .base import ConversationTemplate, ConversationTemplateForTool, StringFormatter, TemplateComponent + logger = logging.getLogger(__name__) -class Llama2ConversationTemplate(ConversationTemplate): + +class Llama2ConversationTemplate(ConversationTemplate): def _encode( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: if tools: - logger.warning("Formatted tools are not supported in Llama2, thus tools will be ignored. " - "If this is intended, please include tools in the system message manually.") - + logger.warning( + "Formatted tools are not supported in Llama2, thus tools will be ignored. " + "If this is intended, please include tools in the system message manually." 
+ ) + res_all = [] - + system_formatted = self.system_formatter.format(content=system) if system else [] - system_formatted_text = "".join([component.content for component in system_formatted if component.type == 'string']) # HACK - + system_formatted_text = "".join( + [component.content for component in system_formatted if component.type == "string"] + ) # HACK + for i in range(0, len(messages), 2): user_message = messages[i] assistant_message = messages[i + 1] - + user_content = system_formatted_text + user_message["content"] if i == 0 else user_message["content"] user_formatted = self.user_formatter.format(content=user_content) assistant_formatted = self.assistant_formatter.format(content=assistant_message["content"]) - + user_encoded = self._encode_template(user_formatted, tokenizer) assistant_encoded = self._encode_template(assistant_formatted, tokenizer) - - res_all.append(( - user_encoded, - assistant_encoded - )) - + + res_all.append((user_encoded, assistant_encoded)) + return res_all -class Llama2ConversationTemplateForTool(Llama2ConversationTemplate): + +class Llama2ConversationTemplateForTool(Llama2ConversationTemplate): def _encode( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: if tools: # logger.warning("Formatted tools are not supported in Llama2, thus tools will be ignored. " # "If this is intended, please include tools in the system message manually.") system = system + tools res_all = [] system_formatted = self.system_formatter.format(content=system) if system else [] - system_formatted_text = "".join([component.content for component in system_formatted if component.type == 'string']) # HACK + system_formatted_text = "".join( + [component.content for component in system_formatted if component.type == "string"] + ) # HACK ls_for_save = [] for i in range(0, len(messages), 1): - if messages[i]['role'] == CONVERSATION_ROLE_NAMES['user']: + if messages[i]["role"] == CONVERSATION_ROLE_NAMES["user"]: user_message = messages[i] if i == 0: - user_content = system_formatted_text + user_message['content'] + user_content = system_formatted_text + user_message["content"] else: - user_content = user_message['content'] + user_content = user_message["content"] user_formatted = self.user_formatter.format(content=user_content) user_encoded = self._encode_template(user_formatted, tokenizer) ls_for_save.append(user_encoded) - elif messages[i]['role'] == CONVERSATION_ROLE_NAMES['function']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["function"]: function_message = messages[i] - function_formatted = self.assistant_formatter.format(content=function_message['content']) + function_formatted = self.assistant_formatter.format(content=function_message["content"]) function_encoded = self._encode_template(function_formatted, tokenizer) ls_for_save.append(function_encoded) - elif messages[i]['role'] == CONVERSATION_ROLE_NAMES['observation']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["observation"]: observation_message = messages[i] - observation_formatted = self.user_formatter.format(content=observation_message['content']) + observation_formatted = self.user_formatter.format(content=observation_message["content"]) observation_encoded = self._encode_template(observation_formatted, tokenizer) ls_for_save.append(observation_encoded) - elif 
messages[i]['role'] == CONVERSATION_ROLE_NAMES['assistant']: + elif messages[i]["role"] == CONVERSATION_ROLE_NAMES["assistant"]: assistant_message = messages[i] assistant_formatted = self.assistant_formatter.format(content=assistant_message["content"]) assistant_encoded = self._encode_template(assistant_formatted, tokenizer) @@ -93,101 +98,114 @@ def _encode( # res_tuple = (ls_for_save[0], ls_for_save[1], ls_for_save[2], ls_for_save[3]) res_all.append(tuple(ls_for_save)) ls_for_save = [] - + if ls_for_save: res_all.append(tuple(ls_for_save)) return res_all + LLAMA3_TEMPLATE = ConversationTemplate( - template_name='llama3', + template_name="llama3", user_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), system_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), - special_starter=TemplateComponent(type='token', content='bos_token') + special_starter=TemplateComponent(type="token", content="bos_token"), ) LLAMA3_TEMPLATE_FOR_TOOL = ConversationTemplateForTool( - template_name='llama3_for_tool', + template_name="llama3_for_tool", user_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), function_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), observation_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>tool<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>tool<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>assistant<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), system_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>') + TemplateComponent( + type="string", content="<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>" + ) ] ), - special_starter=TemplateComponent(type='token', content='bos_token') + special_starter=TemplateComponent(type="token", content="bos_token"), ) LLAMA2_TEMPLATE = Llama2ConversationTemplate( - template_name='llama2', + template_name="llama2", user_formatter=StringFormatter( template=[ - TemplateComponent(type='token', 
content='bos_token'), - TemplateComponent(type='string', content='[INST] {{content}} [/INST]') + TemplateComponent(type="token", content="bos_token"), + TemplateComponent(type="string", content="[INST] {{content}} [/INST]"), ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<>\n{{content}}\n<>\n\n') - ] - ) + template=[TemplateComponent(type="string", content="<>\n{{content}}\n<>\n\n")] + ), ) LLAMA2_TEMPLATE_FOR_TOOL = Llama2ConversationTemplate( - template_name='llama2_for_tool', + template_name="llama2_for_tool", user_formatter=StringFormatter( template=[ - TemplateComponent(type='token', content='bos_token'), - TemplateComponent(type='string', content='[INST] {{content}} [/INST]') + TemplateComponent(type="token", content="bos_token"), + TemplateComponent(type="string", content="[INST] {{content}} [/INST]"), ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<>\n{{content}}\n<>\n\n') - ] - ) -) \ No newline at end of file + template=[TemplateComponent(type="string", content="<>\n{{content}}\n<>\n\n")] + ), +) diff --git a/src/lmflow/utils/conversation_template/phi.py b/src/lmflow/utils/conversation_template/phi.py index cc23d51e2..8aec13549 100644 --- a/src/lmflow/utils/conversation_template/phi.py +++ b/src/lmflow/utils/conversation_template/phi.py @@ -1,26 +1,18 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
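Note that `Llama2ConversationTemplate` never emits a standalone system turn: `_encode` above renders the system formatter to plain text and prepends it to the first user message, matching the reference Llama-2 chat format where the system block sits inside the first `[INST]` turn between `<<SYS>>` markers. A rough illustration of the resulting first-round text (the literal string is an approximation; the bos token is actually added as a token component rather than as text):

```python
# Illustration only: approximate first-round prompt in the reference Llama-2 chat format.
system = "You are a helpful assistant."
first_user = "Hello!"

first_round = f"<s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{first_user} [/INST]"
print(first_round)
```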
-from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent PHI3_TEMPLATE = ConversationTemplate( - template_name='phi3', + template_name="phi3", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|user|>\n{{content}}<|end|>\n') - ] + template=[TemplateComponent(type="string", content="<|user|>\n{{content}}<|end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|assistant|>\n{{content}}<|end|>\n') - ] + template=[TemplateComponent(type="string", content="<|assistant|>\n{{content}}<|end|>\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|system|>\n{{content}}<|end|>\n') - ] + template=[TemplateComponent(type="string", content="<|system|>\n{{content}}<|end|>\n")] ), - special_starter=TemplateComponent(type='token', content='bos_token'), - special_stopper=TemplateComponent(type='token', content='eos_token') -) \ No newline at end of file + special_starter=TemplateComponent(type="token", content="bos_token"), + special_stopper=TemplateComponent(type="token", content="eos_token"), +) diff --git a/src/lmflow/utils/conversation_template/qwen.py b/src/lmflow/utils/conversation_template/qwen.py index 406278e62..d359e589c 100644 --- a/src/lmflow/utils/conversation_template/qwen.py +++ b/src/lmflow/utils/conversation_template/qwen.py @@ -1,62 +1,42 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. -from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple -from transformers import PreTrainedTokenizer - -from .base import StringFormatter, TemplateComponent, ConversationTemplate, ConversationTemplateForTool +from .base import ConversationTemplate, ConversationTemplateForTool, StringFormatter, TemplateComponent QWEN2_TEMPLATE = ConversationTemplate( - template_name='qwen2', + template_name="qwen2", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>system\n{{content}}<|im_end|>\n")] ), - separator=TemplateComponent(type='string', content='\n') + separator=TemplateComponent(type="string", content="\n"), ) QWEN2_TEMPLATE_FOR_TOOL = ConversationTemplateForTool( - template_name='qwen2_for_tool', + template_name="qwen2_for_tool", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")] ), function_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), 
observation_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>tool\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>tool\n{{content}}<|im_end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>system\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>system\n{{content}}<|im_end|>\n")] ), - separator=TemplateComponent(type='string', content='\n') + separator=TemplateComponent(type="string", content="\n"), ) @@ -68,12 +48,12 @@ "{%- else %}" "{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}" "{%- endif %}" - "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + '{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n" }}' "{%- for tool in tools %}" - "{{- \"\\n\" }}" + '{{- "\\n" }}' "{{- tool | tojson }}" "{%- endfor %}" - "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + '{{- "\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\"name\\": , \\"arguments\\": }\\n<|im_end|>\\n" }}' "{%- else %}" "{%- if messages[0]['role'] == 'system' %}" "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" @@ -82,8 +62,8 @@ "{%- endif %}" "{%- endif %}" "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{%- if message.role == \"assistant\" %}" + '{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}' + '{%- if message.role == "assistant" %}' "{{- '<|im_start|>' + message.role + '\\n' }}" "{% generation %}" "{{ message.content + '<|im_end|>' + '\\n' }}" @@ -91,7 +71,7 @@ "{%- else %}" "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" "{%- endif %}" - "{%- elif message.role == \"assistant\" %}" + '{%- elif message.role == "assistant" %}' "{{- '<|im_start|>' + message.role }}" "{%- if message.content %}" "{% generation %}" @@ -103,9 +83,9 @@ "{%- set tool_call = tool_call.function %}" "{%- endif %}" "{% generation %}" - "{{- '\\n\\n{\"name\": \"' }}" + '{{- \'\\n\\n{"name": "\' }}' "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" + '{{- \'", "arguments": \' }}' "{{- tool_call.arguments | tojson }}" "{{- '}\\n' }}" "{% endgeneration %}" @@ -113,14 +93,14 @@ "{% generation %}" "{{- '<|im_end|>\\n' }}" "{% endgeneration %}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + '{%- elif message.role == "tool" %}' + '{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}' "{{- '<|im_start|>user' }}" "{%- endif %}" "{{- '\\n\\n' }}" "{{- message.content }}" "{{- 
'\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + '{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}' "{{- '<|im_end|>\\n' }}" "{%- endif %}" "{%- endif %}" @@ -139,12 +119,12 @@ "{%- else %}" "{{- 'You are a helpful assistant.' }}" "{%- endif %}" - "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + '{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n" }}' "{%- for tool in tools %}" - "{{- \"\\n\" }}" + '{{- "\\n" }}' "{{- tool | tojson }}" "{%- endfor %}" - "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + '{{- "\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\"name\\": , \\"arguments\\": }\\n<|im_end|>\\n" }}' "{%- else %}" "{%- if messages[0]['role'] == 'system' %}" "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" @@ -153,8 +133,8 @@ "{%- endif %}" "{%- endif %}" "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{%- if message.role == \"assistant\" %}" + '{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}' + '{%- if message.role == "assistant" %}' "{{- '<|im_start|>' + message.role + '\\n' }}" "{% generation %}" "{{ message.content + '<|im_end|>' + '\\n' }}" @@ -162,7 +142,7 @@ "{%- else %}" "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" "{%- endif %}" - "{%- elif message.role == \"assistant\" %}" + '{%- elif message.role == "assistant" %}' "{{- '<|im_start|>' + message.role }}" "{%- if message.content %}" "{% generation %}" @@ -174,9 +154,9 @@ "{%- set tool_call = tool_call.function %}" "{%- endif %}" "{% generation %}" - "{{- '\\n\\n{\"name\": \"' }}" + '{{- \'\\n\\n{"name": "\' }}' "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" + '{{- \'", "arguments": \' }}' "{{- tool_call.arguments | tojson }}" "{{- '}\\n' }}" "{% endgeneration %}" @@ -184,14 +164,14 @@ "{% generation %}" "{{- '<|im_end|>\\n' }}" "{% endgeneration %}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + '{%- elif message.role == "tool" %}' + '{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}' "{{- '<|im_start|>user' }}" "{%- endif %}" "{{- '\\n\\n' }}" "{{- message.content }}" "{{- '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + '{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}' "{{- '<|im_end|>\\n' }}" "{%- endif %}" "{%- endif %}" @@ -200,8 +180,8 @@ "{{- '<|im_start|>assistant\\n' }}" "{%- endif %}" ) - - + + QWEN2_5_MATH_TEMPLATE = ( "{%- if tools %}" "{{- '<|im_start|>system\\n' }}" @@ -210,12 +190,12 @@ "{%- else %}" "{{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' 
}}" "{%- endif %}" - "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + '{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n" }}' "{%- for tool in tools %}" - "{{- \"\\n\" }}" + '{{- "\\n" }}' "{{- tool | tojson }}" "{%- endfor %}" - "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + '{{- "\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\"name\\": , \\"arguments\\": }\\n<|im_end|>\\n" }}' "{%- else %}" "{%- if messages[0]['role'] == 'system' %}" "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" @@ -224,8 +204,8 @@ "{%- endif %}" "{%- endif %}" "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{%- if message.role == \"assistant\" %}" + '{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}' + '{%- if message.role == "assistant" %}' "{{- '<|im_start|>' + message.role + '\\n' }}" "{% generation %}" "{{ message.content + '<|im_end|>' + '\\n' }}" @@ -233,7 +213,7 @@ "{%- else %}" "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" "{%- endif %}" - "{%- elif message.role == \"assistant\" %}" + '{%- elif message.role == "assistant" %}' "{{- '<|im_start|>' + message.role }}" "{%- if message.content %}" "{% generation %}" @@ -245,9 +225,9 @@ "{%- set tool_call = tool_call.function %}" "{%- endif %}" "{% generation %}" - "{{- '\\n\\n{\"name\": \"' }}" + '{{- \'\\n\\n{"name": "\' }}' "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" + '{{- \'", "arguments": \' }}' "{{- tool_call.arguments | tojson }}" "{{- '}\\n' }}" "{% endgeneration %}" @@ -255,14 +235,14 @@ "{% generation %}" "{{- '<|im_end|>\\n' }}" "{% endgeneration %}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + '{%- elif message.role == "tool" %}' + '{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}' "{{- '<|im_start|>user' }}" "{%- endif %}" "{{- '\\n\\n' }}" "{{- message.content }}" "{{- '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + '{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}' "{{- '<|im_end|>\\n' }}" "{%- endif %}" "{%- endif %}" @@ -271,8 +251,8 @@ "{{- '<|im_start|>assistant\\n' }}" "{%- endif %}" ) - - + + QWEN_QWQ_TEMPLATE = ( "{%- if tools %}" "{{- '<|im_start|>system\\n' }}" @@ -281,12 +261,12 @@ "{%- else %}" "{{- 'You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.' 
}}" "{%- endif %}" - "{{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}" + '{{- "\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n" }}' "{%- for tool in tools %}" - "{{- \"\\n\" }}" + '{{- "\\n" }}' "{{- tool | tojson }}" "{%- endfor %}" - "{{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}" + '{{- "\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\"name\\": , \\"arguments\\": }\\n<|im_end|>\\n" }}' "{%- else %}" "{%- if messages[0]['role'] == 'system' %}" "{{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}" @@ -295,8 +275,8 @@ "{%- endif %}" "{%- endif %}" "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{%- if message.role == \"assistant\" %}" + '{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}' + '{%- if message.role == "assistant" %}' "{{- '<|im_start|>' + message.role + '\\n' }}" "{% generation %}" "{{ message.content + '<|im_end|>' + '\\n' }}" @@ -304,7 +284,7 @@ "{%- else %}" "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" "{%- endif %}" - "{%- elif message.role == \"assistant\" %}" + '{%- elif message.role == "assistant" %}' "{{- '<|im_start|>' + message.role }}" "{%- if message.content %}" "{% generation %}" @@ -316,9 +296,9 @@ "{%- set tool_call = tool_call.function %}" "{%- endif %}" "{% generation %}" - "{{- '\\n\\n{\"name\": \"' }}" + '{{- \'\\n\\n{"name": "\' }}' "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" + '{{- \'", "arguments": \' }}' "{{- tool_call.arguments | tojson }}" "{{- '}\\n' }}" "{% endgeneration %}" @@ -326,14 +306,14 @@ "{% generation %}" "{{- '<|im_end|>\\n' }}" "{% endgeneration %}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + '{%- elif message.role == "tool" %}' + '{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}' "{{- '<|im_start|>user' }}" "{%- endif %}" "{{- '\\n\\n' }}" "{{- message.content }}" "{{- '\\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + '{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}' "{{- '<|im_end|>\\n' }}" "{%- endif %}" "{%- endif %}" @@ -341,4 +321,94 @@ "{%- if add_generation_prompt %}" "{{- '<|im_start|>assistant\\n' }}" "{%- endif %}" -) \ No newline at end of file +) + +QWEN3_TEMPLATE = ( + "{%- if tools %}" + "{{- '<|im_start|>system\\n' }}" + "{%- if messages[0].role == 'system' %}" + "{{- messages[0].content + '\\n\\n' }}" + "{%- endif %}" + '{{- "# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n" }}' + "{%- for tool in tools %}" + '{{- "\\n" }}' + "{{- tool | tojson }}" + "{%- endfor %}" + '{{- "\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\"name\\": , \\"arguments\\": }\\n<|im_end|>\\n" }}' + "{%- else %}" + "{%- if 
messages[0].role == 'system' %}" + "{{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}" + "{%- for message in messages[::-1] %}" + "{%- set index = (messages|length - 1) - loop.index0 %}" + "{%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}" + "{%- set ns.multi_step_tool = false %}" + "{%- set ns.last_query_index = index %}" + "{%- endif %}" + "{%- endfor %}" + "{%- for message in messages %}" + '{%- if (message.role == "user") or (message.role == "system" and not loop.first) %}' + "{{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}" + '{%- elif message.role == "assistant" %}' + "{% generation %}" + "{%- set content = message.content %}" + "{%- set reasoning_content = '' %}" + "{%- if message.reasoning_content is defined and message.reasoning_content is not none %}" + "{%- set reasoning_content = message.reasoning_content %}" + "{%- else %}" + "{%- if '' in message.content %}" + "{%- set content = message.content.split('')[-1].lstrip('\\n') %}" + "{%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}" + "{%- endif %}" + "{%- endif %}" + "{%- if loop.index0 > ns.last_query_index %}" + "{%- if loop.last or (not loop.last and reasoning_content) %}" + "{{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + content }}" + "{%- endif %}" + "{%- else %}" + "{{- '<|im_start|>' + message.role + '\\n' + content }}" + "{%- endif %}" + "{%- if message.tool_calls %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if (loop.first and content) or (not loop.first) %}" + "{{- '\\n' }}" + "{%- endif %}" + "{%- if tool_call.function %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + '{{- \'\\n{"name": "\' }}' + "{{- tool_call.name }}" + '{{- \'", "arguments": \' }}' + "{%- if tool_call.arguments is string %}" + "{{- tool_call.arguments }}" + "{%- else %}" + "{{- tool_call.arguments | tojson }}" + "{%- endif %}" + "{{- '}\\n' }}" + "{%- endfor %}" + "{%- endif %}" + "{{- '<|im_end|>\\n' }}" + "{% endgeneration %}" + '{%- elif message.role == "tool" %}' + '{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}' + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\\n\\n' }}" + "{{- message.content }}" + "{{- '\\n' }}" + '{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}' + "{{- '<|im_end|>\\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\\n' }}" + "{%- if enable_thinking is defined and enable_thinking is false %}" + "{{- '\\n\\n\\n\\n' }}" + "{%- endif %}" + "{%- endif %}" +) diff --git a/src/lmflow/utils/conversation_template/yi.py b/src/lmflow/utils/conversation_template/yi.py index c04bb8440..abbe69935 100644 --- a/src/lmflow/utils/conversation_template/yi.py +++ b/src/lmflow/utils/conversation_template/yi.py @@ -1,24 +1,14 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. 
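`QWEN3_TEMPLATE` above also reads an `enable_thinking` flag at render time; extra keyword arguments to `apply_chat_template` are forwarded to the Jinja context, which is how the template's final branch is toggled. A sketch; the import path and model id are assumptions.

```python
# Sketch: render the Qwen3 template with thinking disabled.
from transformers import AutoTokenizer

from lmflow.utils.conversation_template.qwen import QWEN3_TEMPLATE  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")  # placeholder model id
messages = [{"role": "user", "content": "Briefly explain LoRA."}]

text = tokenizer.apply_chat_template(
    messages,
    chat_template=QWEN3_TEMPLATE,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # picked up by the template's final `enable_thinking is false` branch
)
print(text)
```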
-from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent YI1_5_TEMPLATE = ConversationTemplate( - template_name='yi1_5', + template_name="yi1_5", user_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>user\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>user\n{{content}}<|im_end|>\n")] ), assistant_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='<|im_start|>assistant\n{{content}}<|im_end|>\n') - ] + template=[TemplateComponent(type="string", content="<|im_start|>assistant\n{{content}}<|im_end|>\n")] ), - system_formatter=StringFormatter( - template=[ - TemplateComponent(type='string', content='{{content}}') - ] - ) -) \ No newline at end of file + system_formatter=StringFormatter(template=[TemplateComponent(type="string", content="{{content}}")]), +) diff --git a/src/lmflow/utils/conversation_template/zephyr.py b/src/lmflow/utils/conversation_template/zephyr.py index d79d57864..9ec7cb81e 100644 --- a/src/lmflow/utils/conversation_template/zephyr.py +++ b/src/lmflow/utils/conversation_template/zephyr.py @@ -1,73 +1,69 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -from typing import Dict, Set, Sequence, Literal, Union, List, Optional, Tuple +from collections.abc import Sequence +from typing import Optional from transformers import PreTrainedTokenizer -from .base import StringFormatter, TemplateComponent, ConversationTemplate - +from .base import ConversationTemplate, StringFormatter, TemplateComponent logger = logging.getLogger(__name__) -class ZephyrConversationTemplate(ConversationTemplate): +class ZephyrConversationTemplate(ConversationTemplate): def _encode( self, tokenizer: PreTrainedTokenizer, - messages: List[Dict[str, str]], + messages: list[dict[str, str]], system: Optional[str] = None, tools: Optional[str] = None, - **kwargs - ) -> Sequence[Tuple[List[int], List[int]]]: + **kwargs, + ) -> Sequence[tuple[list[int], list[int]]]: # TODO: truncation according to model max length # TODO: make sure the last few tokens are "learnable", not masked with token_id = -100. 
- + res_all = [] - + system_formatted = self.system_formatter.format(content=system) if system else [] system_encoded = self._encode_template(system_formatted, tokenizer) - + for i in range(0, len(messages), 2): user_message = messages[i] assistant_message = messages[i + 1] - + user_formatted = self.user_formatter.format(content=user_message["content"]) if i == 0 and not system: # when system is not provided, the first user message should not start with a newline - user_formatted[0].content = user_formatted[0].content.replace('\n', '', 1) + user_formatted[0].content = user_formatted[0].content.replace("\n", "", 1) assistant_formatted = self.assistant_formatter.format(content=assistant_message["content"]) - + user_encoded = self._encode_template(user_formatted, tokenizer) assistant_encoded = self._encode_template(assistant_formatted, tokenizer) - - res_all.append(( - system_encoded + user_encoded if i == 0 else user_encoded, - assistant_encoded - )) - + + res_all.append((system_encoded + user_encoded if i == 0 else user_encoded, assistant_encoded)) + return res_all - - + + ZEPHYR_TEMPLATE = ZephyrConversationTemplate( - template_name='zephyr', + template_name="zephyr", user_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='\n<|user|>\n{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="\n<|user|>\n{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] ), assistant_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='\n<|assistant|>\n{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="\n<|assistant|>\n{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] ), system_formatter=StringFormatter( template=[ - TemplateComponent(type='string', content='<|system|>\n{{content}}'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="string", content="<|system|>\n{{content}}"), + TemplateComponent(type="token", content="eos_token"), ] - ) -) \ No newline at end of file + ), +) diff --git a/src/lmflow/utils/data_utils.py b/src/lmflow/utils/data_utils.py index d7ecd1eda..7ce34772a 100644 --- a/src/lmflow/utils/data_utils.py +++ b/src/lmflow/utils/data_utils.py @@ -1,4 +1,4 @@ -"""The program includes several functions: setting a random seed, +"""The program includes several functions: setting a random seed, loading data from a JSON file, batching data, and extracting answers from generated text. """ @@ -6,7 +6,7 @@ import os import random import re -from typing import Union, List, TypedDict, Dict +from typing import TypedDict, Union import numpy as np import torch @@ -20,7 +20,7 @@ def set_random_seed(seed: int): ------------ seed : int The default seed. - + """ random.seed(seed) np.random.seed(seed) @@ -28,6 +28,7 @@ def set_random_seed(seed: int): if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed) + def load_data(file_name: str): """ Load data with file name. @@ -36,20 +37,20 @@ def load_data(file_name: str): ------------ file_name : str. The dataset file name. - + Returns ------------ inputs : list. The input texts of the dataset. outputs : list. - The output texts file datasets. + The output texts file datasets. len : int. The length of the dataset. 
""" inputs = [] outputs = [] type = "" - with open(file_name, encoding='utf-8') as f: + with open(file_name, encoding="utf-8") as f: json_data = json.load(f) type = json_data["type"] for line in json_data["instances"]: @@ -61,6 +62,7 @@ def load_data(file_name: str): return inputs, outputs, len(outputs) + def batchlize(examples: list, batch_size: int, random_shuffle: bool): """ Convert examples to a dataloader. @@ -73,7 +75,7 @@ def batchlize(examples: list, batch_size: int, random_shuffle: bool): random_shuffle : bool If true, the dataloader shuffle the training data. - + Returns ------------ dataloader: @@ -82,15 +84,15 @@ def batchlize(examples: list, batch_size: int, random_shuffle: bool): size = 0 dataloader = [] length = len(examples) - if (random_shuffle): + if random_shuffle: random.shuffle(examples) while size < length: if length - size > batch_size: - dataloader.append(examples[size : size+batch_size]) + dataloader.append(examples[size : size + batch_size]) size += batch_size else: - dataloader.append(examples[size : size+(length-size)]) - size += (length - size) + dataloader.append(examples[size : size + (length - size)]) + size += length - size return dataloader @@ -98,35 +100,34 @@ def preview_file(file_path: str, chars: int = 100): """ Returns the first and last specified number of characters from a file without loading the entire file into memory, working with any file type. - + Args: file_path (str): Path to the file to be previewed chars (int, optional): Number of characters to show from start and end. Defaults to 100. - + Returns: tuple: (first_chars, last_chars) - The first and last characters from the file """ file_size = os.path.getsize(file_path) - - with open(file_path, 'r', encoding='utf-8') as f: + + with open(file_path, encoding="utf-8") as f: first_chars = f.read(chars) - + if file_size <= 2 * chars: return first_chars, "" - + last_chunk_position = max(0, file_size - chars) - + f.seek(0) f.seek(last_chunk_position) - + last_chars = f.read(chars) - + return first_chars, last_chars def get_dataset_type_fast(file_path: str, max_chars: int = 100) -> Union[str, None]: - '''Get the type values from the first and last n lines of a large json dataset. - ''' + """Get the type values from the first and last n lines of a large json dataset.""" file_content_preview = [] dataset_type = None dataset_type_pattern = re.compile(r'[\"\']type[\"\']:\s*[\'\"]([^"]+)[\'\"]') @@ -141,10 +142,9 @@ def get_dataset_type_fast(file_path: str, max_chars: int = 100) -> Union[str, No def check_dataset_instances_key_fast(file_path: str, instances_key: str, max_lines: int = 100) -> bool: - '''Check if the dataset instances key matches the instance_key. - ''' + """Check if the dataset instances key matches the instance_key.""" file_content_preview = [] - instance_key_pattern = re.compile(r'[\"\']' + instances_key + r'[\"\']') + instance_key_pattern = re.compile(r"[\"\']" + instances_key + r"[\"\']") file_content_preview.extend(preview_file(file_path, max_lines)) for content in file_content_preview: if instance_key_pattern.search(content): @@ -152,14 +152,13 @@ def check_dataset_instances_key_fast(file_path: str, instances_key: str, max_lin return False -def answer_extraction(response, answer_type=None): #use this funtion to extract answers from generated text - +def answer_extraction(response, answer_type=None): # use this funtion to extract answers from generated text """ Use this funtion to extract answers from generated text Parameters ------------ - args : + args : Arguments. 
response : str plain string response. @@ -175,16 +174,16 @@ def answer_extraction(response, answer_type=None): #use this funtion to extrac temp = response if answer_type in ("gsm8k", "svamp", "asdiv", "addsub", "singleeq", "multiarith", "math"): temp = temp.replace(",", "") - temp = [s for s in re.findall(r'-?\d+\.?\d*', temp)] + temp = [s for s in re.findall(r"-?\d+\.?\d*", temp)] elif answer_type in ("aqua", "csqa", "multiple_choice"): - temp = re.findall(r'A|B|C|D|E', temp) + temp = re.findall(r"A|B|C|D|E", temp) elif answer_type in ("strategyqa", "coin_flip"): temp = temp.lower() - temp = re.sub("\"|\'|\n|\.|\s|\:|\,"," ", temp) + temp = re.sub("\"|'|\n|\.|\s|\:|\,", " ", temp) temp = temp.split(" ") temp = [i for i in temp if i in ("yes", "no")] elif answer_type in ("last_letters"): - temp = re.sub("\"|\'|\n|\.|\s","", temp) + temp = re.sub("\"|'|\n|\.|\s", "", temp) temp = [temp] elif answer_type in ("pubmedqa", "binary_choice"): # pattern = "Output: (yes|no|maybe)" @@ -217,7 +216,7 @@ def answer_extraction(response, answer_type=None): #use this funtion to extrac pattern = "\(*(A|B|C|D|a|b|c|d)\)*(\.|\s)" sttr = re.search(pattern, temp) if sttr is not None: - if '(' in sttr.group(0): + if "(" in sttr.group(0): answer = sttr.group(0)[1].lower() else: answer = sttr.group(0)[0].lower() @@ -238,7 +237,7 @@ def answer_extraction(response, answer_type=None): #use this funtion to extrac pattern = "\(*(A|B|C|D|a|b|c|d)\)*(\.|\s)" sttr = re.search(pattern, temp) if sttr is not None: - if '(' in sttr.group(0): + if "(" in sttr.group(0): answer = sttr.group(0)[1].lower() else: answer = sttr.group(0)[0].lower() @@ -259,16 +258,16 @@ def answer_extraction(response, answer_type=None): #use this funtion to extrac answer = answer[:-1] # round the answer to nearest integer - if answer_type in ("gsm8k", "svamp"): + if answer_type in ["gsm8k", "svamp"]: try: answer = str(round(float(answer))) - except: - answer = "" # no sol or sol doesn't have valid format - elif answer_type in ("last_letters"): - try: - answer = answer[-args.concat_length:] - except: - answer = "" + except Exception: + answer = "" # no sol or sol doesn't have valid format + # elif answer_type in ["last_letters"]: + # try: + # answer = answer[-args.concat_length :] # TODO: args? 
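To make the branches above concrete (a sketch, not part of the diff): `answer_extraction` pulls a short final answer out of a free-form generation, using per-benchmark regexes and, for the numeric benchmarks, rounding the last number found. Assuming it is imported from `lmflow.utils.data_utils`:

```python
# Sketch: extract the final numeric answer from a GSM8K-style generation.
from lmflow.utils.data_utils import answer_extraction

response = "She bought 3 packs of 12 eggs, so 3 * 12 = 36. The answer is 36."
answer = answer_extraction(response, answer_type="gsm8k")
print(answer)  # expected: "36" (last number in the text, rounded to an integer string)
```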
+ # except: + # answer = "" else: answer = "" return answer @@ -288,9 +287,9 @@ def process_image_flag(text, image_flag=""): class VLLMInferenceResultWithInput(TypedDict): input: str - output: Union[List[str], List[List[int]]] - + output: Union[list[str], list[list[int]]] + class RewardModelInferenceResultWithInput(TypedDict): input: str - output: List[Dict[str, Union[str, float]]] # [{"score": 0.5, "text": "output text"}] \ No newline at end of file + output: list[dict[str, Union[str, float]]] # [{"score": 0.5, "text": "output text"}] diff --git a/src/lmflow/utils/debug/profiler.py b/src/lmflow/utils/debug/profiler.py index fcb7ebf6f..7ccd23c4d 100644 --- a/src/lmflow/utils/debug/profiler.py +++ b/src/lmflow/utils/debug/profiler.py @@ -1,5 +1,5 @@ -import time import pprint +import time class Timer: @@ -14,24 +14,26 @@ def start(self, tag): def end(self, tag): self.runtimes[tag]["end"] = time.time() self.runtimes[tag]["elapsed"] = self.runtimes[tag]["end"] - self.runtimes[tag]["start"] - + def get_runtime(self, tag): return self.runtimes[tag]["elapsed"] - + def show(self): self._to_readable() pprint.pprint(self.runtimes_readable) - + def _to_readable(self): for tag, runtime in self.runtimes.items(): - self.runtimes_readable[tag] = {"start": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(runtime["start"]))} - self.runtimes_readable[tag]["end"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(runtime["end"])) + self.runtimes_readable[tag] = { + "start": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(runtime["start"])) + } + self.runtimes_readable[tag]["end"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(runtime["end"])) self.runtimes_readable[tag]["elapsed"] = round(runtime["elapsed"], 5) - + if __name__ == "__main__": timer = Timer("profiler") timer.start("main") time.sleep(1) timer.end("main") - timer.show() \ No newline at end of file + timer.show() diff --git a/src/lmflow/utils/envs.py b/src/lmflow/utils/envs.py new file mode 100644 index 000000000..1740d154a --- /dev/null +++ b/src/lmflow/utils/envs.py @@ -0,0 +1,8 @@ +import os + + +def is_accelerate_env(): + for key, _ in os.environ.items(): + if key.startswith("ACCELERATE_"): + return True + return False diff --git a/src/lmflow/utils/flash_attention/__init__.py b/src/lmflow/utils/flash_attention/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/lmflow/utils/flash_attention/bloom_flash_attention.py b/src/lmflow/utils/flash_attention/bloom_flash_attention.py deleted file mode 100644 index 84702ba26..000000000 --- a/src/lmflow/utils/flash_attention/bloom_flash_attention.py +++ /dev/null @@ -1,102 +0,0 @@ -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn -import torch.nn.functional as F - -import transformers -from transformers.models.bloom.modeling_bloom import dropout_add - -from einops import rearrange - -from .triton_flash_attention import flash_attn_qkvpacked_func - -def forward( - self, - hidden_states: torch.Tensor, - residual: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - ): - dtype = hidden_states.dtype - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] - - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - - batch_size, q_length, _, _ = 
query_layer.shape - bsz, q_len = batch_size, q_length - - if layer_past is not None: - past_key, past_value = layer_past - # concatenate along seq_length dimension: - # - key: [batch_size * self.num_heads, head_dim, kv_length] - # - value: [batch_size * self.num_heads, kv_length, head_dim] - key_layer = torch.cat((past_key, key_layer), dim=2) - value_layer = torch.cat((past_value, value_layer), dim=1) - - if use_cache is True: - present = (key_layer, value_layer) - else: - present = None - - reshaped_alibi = rearrange(alibi, '(b h) one s-> b h one s', h = self.num_heads) - reshaped_alibi = reshaped_alibi * self.beta - - attention_mask = (1.0 - attention_mask) - attention_mask = attention_mask[:, None, None, :].bool() - reshaped_alibi_masked = reshaped_alibi.masked_fill(attention_mask, -1e9) - - reshaped_query_layer = query_layer - reshaped_key_layer = key_layer - reshaped_value_layer = value_layer - - qkv = torch.concat([reshaped_query_layer.unsqueeze(2), reshaped_key_layer.unsqueeze(2), reshaped_value_layer.unsqueeze(2)], dim = 2) - - output = flash_attn_qkvpacked_func( - qkv, reshaped_alibi_masked, True, self.inv_norm_factor - ) - - output = rearrange(output, 'b s h d -> (b h) s d') - - # change view [batch_size, num_heads, q_length, head_dim] - context_layer = self._merge_heads(output) - - # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 - if self.pretraining_tp > 1 and self.slow_but_exact: - slices = self.hidden_size / self.pretraining_tp - output_tensor = torch.zeros_like(context_layer) - for i in range(self.pretraining_tp): - output_tensor = output_tensor + F.linear( - context_layer[:, :, int(i * slices) : int((i + 1) * slices)], - self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], - ) - else: - output_tensor = self.dense(context_layer) - - output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) - - outputs = (output_tensor, present) - if output_attentions: - outputs += (context_layer,) - - return outputs - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_attn_mask( - self, attention_mask: torch.Tensor, input_shape: Tuple[int, int], past_key_values_length: int - ) -> torch.BoolTensor: - - return attention_mask - -def replace_bloom_attn_with_flash_attn(): - transformers.models.bloom.modeling_bloom.BloomModel._prepare_attn_mask = ( - _prepare_attn_mask - ) - transformers.models.bloom.modeling_bloom.BloomAttention.forward = forward \ No newline at end of file diff --git a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py b/src/lmflow/utils/flash_attention/gpt2_flash_attention.py deleted file mode 100644 index bac90b447..000000000 --- a/src/lmflow/utils/flash_attention/gpt2_flash_attention.py +++ /dev/null @@ -1,136 +0,0 @@ -from typing import List, Optional, Tuple, Union - -import torch -from torch import nn - -import transformers -from transformers.models.llama.modeling_llama import apply_rotary_pos_emb - -from einops import rearrange - -#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x -try: - from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func -except: - from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func - -from flash_attn.bert_padding import unpad_input, pad_input - - -def forward( - self, - hidden_states: Optional[Tuple[torch.FloatTensor]], - layer_past: 
Optional[Tuple[torch.Tensor]] = None, - attention_mask: Optional[torch.FloatTensor] = None, - head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.Tensor] = None, - encoder_attention_mask: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = False, - output_attentions: Optional[bool] = False, - ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: - - - if encoder_hidden_states is not None: - if not hasattr(self, "q_attn"): - raise ValueError( - "If class is used as cross attention, the weights `q_attn` have to be defined. " - "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." - ) - - query = self.q_attn(hidden_states) - key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) - attention_mask = encoder_attention_mask - else: - query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) - - bsz, q_len, _ = hidden_states.size() - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - #TODO Should we support? - if layer_past is not None: - past_key, past_value = layer_past - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - assert use_cache is False, "Use cache is not supported" - present = None - # if use_cache is True: - # present = (key, value) - # else: - # present = None - - assert self.reorder_and_upcast_attn is False, "reorder_and_upcast_attn is not supported yet" - - qkv = torch.stack([query, key, value], dim = 2) - qkv = qkv.transpose(1, 3) # [bsz, seq_len, 3, heads, hiddens_per_head] - - # breakpoint() - key_padding_mask = attention_mask - # key_padding_mask = None - # breakpoint() - if key_padding_mask is None: - qkv = rearrange(qkv, "b s ... -> (b s) ...") - max_s = q_len - cu_q_lens = torch.arange( - 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device - ) - output = flash_attn_unpadded_qkvpacked_func( - qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True - ) - output = rearrange(output, "(b s) ... 
-> b s ...", b=bsz) - else: - # flip in flash attention - key_padding_mask = key_padding_mask.clone() - key_padding_mask = (1.0 - key_padding_mask) - key_padding_mask = key_padding_mask.squeeze(1).squeeze(1) - nheads = qkv.shape[-2] - x = rearrange(qkv, "b s three h d -> b s (three h d)") - x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) - x_unpad = rearrange( - x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads - ) - output_unpad = flash_attn_unpadded_qkvpacked_func( - x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True - ) - output = rearrange( - pad_input( - rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len - ), - "b s (h d) -> b s h d", - h=nheads, - ) - # if self.reorder_and_upcast_attn: - # attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask) - # else: - # attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - output = rearrange(output, 'b s h d -> b h s d') - attn_output = self._merge_heads(output, self.num_heads, self.head_dim) - attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - - assert output_attentions is False, "output attentions is not supported yet" - # if output_attentions: - # outputs += (attn_weights,) - - return outputs # a, present, (attentions) - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_decoder_attention_mask( - self, attention_mask, input_shape, inputs_embeds, past_key_values_length -): - # [bsz, seq_len] - return attention_mask - - -def replace_gpt2_attn_with_flash_attn(): - # transformers.models.gpt2.modeling_gpt2.LlamaModel._prepare_decoder_attention_mask = ( - # _prepare_decoder_attention_mask - # ) - transformers.models.gpt2.modeling_gpt2.GPT2Attention.forward = forward \ No newline at end of file diff --git a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py b/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py deleted file mode 100644 index 49c3d50a0..000000000 --- a/src/lmflow/utils/flash_attention/gpt_neo_flash_attention.py +++ /dev/null @@ -1,100 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -import transformers -from einops import rearrange - -#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x -try: - from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func -except: - from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func - -from flash_attn.bert_padding import unpad_input, pad_input - -def _attn(self, query, key, value, attention_mask=None, head_mask=None): - # (batch, head, seq_length, head_features) - query = query.to(torch.bfloat16) - key = key.to(torch.bfloat16) - query = query * torch.sqrt(torch.tensor(self.head_dim)) - qkv = torch.stack( - [query, key, value], dim=2 - )# [bsz, nh, 3, t, hd] - qkv = qkv.transpose(1,3)## [bsz, q_len, 3, nh, hd] - bsz = qkv.shape[0] - q_len = qkv.shape[1] - - attention_mask = torch.where(attention_mask == -0.0, True, False) - key_padding_mask = rearrange(attention_mask, "b () () s -> b s") if attention_mask is not None else None - if key_padding_mask is None: - qkv = rearrange(qkv, "b s ... 
-> (b s) ...") - max_s = q_len - cu_q_lens = torch.arange( - 0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device - ) - output = flash_attn_unpadded_qkvpacked_func( - qkv, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0 , softmax_scale=None, causal=True - )# attention compute - output = rearrange(output, "(b s) ... -> b s ...", b=bsz) - else: - nheads = qkv.shape[-2] - x = rearrange(qkv, "b s three h d -> b s (three h d)") - x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) - x_unpad = rearrange( - x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads - ) - output_unpad = flash_attn_unpadded_qkvpacked_func( - x_unpad, cu_q_lens, max_s, self.attn_dropout.p if self.training else 0.0, softmax_scale=None, causal=True - ) - output = rearrange( - pad_input( - rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len - ), - "b s (h d) -> b s h d", - h=nheads, - ) - - return output, None - -def forward( - self, - hidden_states, - attention_mask=None, - layer_past=None, - head_mask=None, - use_cache=False, - output_attentions=False, - ): - - assert head_mask is None, "head_mask is not supported" - assert not output_attentions, "output_attentions is not supported" - assert not use_cache, "use_cache is not supported" - - query = self.q_proj(hidden_states) - key = self.k_proj(hidden_states) - value = self.v_proj(hidden_states) - - query = self._split_heads(query, self.num_heads, self.head_dim) - key = self._split_heads(key, self.num_heads, self.head_dim) - value = self._split_heads(value, self.num_heads, self.head_dim) - - if layer_past is not None: - past_key = layer_past[0] - past_value = layer_past[1] - key = torch.cat((past_key, key), dim=-2) - value = torch.cat((past_value, value), dim=-2) - - present = None - attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) - new_shape = attn_output.size()[:-2] + (self.num_heads * self.head_dim,) - attn_output = attn_output.view(new_shape) - attn_output = self.out_proj(attn_output) - attn_output = self.resid_dropout(attn_output) - - outputs = (attn_output, present) - - return outputs # a, present, (attentions) - -def replace_gpt_neo_attn_with_flash_attn(): - transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._attn = _attn - transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention.forward = forward \ No newline at end of file diff --git a/src/lmflow/utils/flash_attention/llama_flash_attention.py b/src/lmflow/utils/flash_attention/llama_flash_attention.py deleted file mode 100644 index 3fc691455..000000000 --- a/src/lmflow/utils/flash_attention/llama_flash_attention.py +++ /dev/null @@ -1,124 +0,0 @@ -from typing import List, Optional, Tuple - -import torch -from torch import nn -import math - -import transformers -from transformers.models.llama.modeling_llama import apply_rotary_pos_emb,_make_causal_mask,_expand_mask - -from einops import rearrange - -#try to import flash_attn 2.x.x, if not, import flash_attn 1.x.x -try: - from flash_attn.flash_attn_interface import flash_attn_func -except: - from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func as flash_attn_func - -from flash_attn.bert_padding import unpad_input, pad_input - - -def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> 
Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - if self.config.pretraining_tp > 1: - raise ValueError("pretraining_tp > 1 is not supported for flash attention") - else: - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - - kv_seq_len = key_states.shape[-2] - - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - query_states, key_states, value_states = [ - rearrange(x, "b h s d -> b s h d") for x in [query_states, key_states, value_states] - ] - - input_dtype = query_states.dtype - if input_dtype == torch.float32: - # Handle the case where the model is quantized - if hasattr(self.config, "_pre_quantization_dtype"): - target_dtype = self.config._pre_quantization_dtype - else: - target_dtype = self.q_proj.weight.dtype - - query_states = query_states.to(target_dtype) - key_states = key_states.to(target_dtype) - value_states = value_states.to(target_dtype) - - # below output will have shape (batch_size, seqlen, nheads, headdim) - attn_output = flash_attn_func(query_states, key_states, value_states, causal=True) - - if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - attn_output = self.o_proj(attn_output) - if output_attentions: - raise NotImplementedError("`output_attentions` is not supported when `use_flash_attn` is True") - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -# Disable the transformation of the attention mask in LlamaModel as the flash attention -# requires the attention mask to be the same as the key_padding_mask -def _prepare_decoder_attention_mask( - self, attention_mask, input_shape, inputs_embeds, past_key_values_length -): - # [bsz, seq_len] - if input_shape[-1] > 1 and past_key_values_length == 0: # encode - return attention_mask - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, - inputs_embeds.dtype, - device=inputs_embeds.device, - past_key_values_length=past_key_values_length, - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( - inputs_embeds.device - ) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - 
-def replace_llama_attn_with_flash_attn(): - transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask - transformers.models.llama.modeling_llama.LlamaAttention.forward = forward diff --git a/src/lmflow/utils/flash_attention/triton_flash_attention.py b/src/lmflow/utils/flash_attention/triton_flash_attention.py deleted file mode 100644 index 1fefabe3b..000000000 --- a/src/lmflow/utils/flash_attention/triton_flash_attention.py +++ /dev/null @@ -1,832 +0,0 @@ -""" -*Experimental* implementation of FlashAttention in Triton. -Tested with triton==2.0.0.dev20221202. -Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions -other than 64: -https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207 -We'll update this implementation with the new Triton backend once this is fixed. - -We use the FlashAttention implementation from Phil Tillet a starting point. -https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py - -Changes: -- Implement both causal and non-causal attention. -- Implement both self-attention and cross-attention. -- Support arbitrary seqlens (not just multiples of 128), for both forward and backward. -- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward. -- Support attention bias. -- Speed up the forward pass a bit, and only store the LSE instead of m and l. -- Make the backward for d=128 much faster by reducing register spilling. -- Optionally parallelize the backward pass across seqlen_k, to deal with the case of -small batch size * nheads. - -Caution: -- This is an *experimental* implementation. The forward pass should be quite robust but -I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler). -- This implementation has only been tested on A100. -- If you plan to use headdim other than 64 and 128, you should test for race conditions -(due to the Triton compiler), as done in tests/test_flash_attn.py -"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions -for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident -that there are none left for other head dimensions. - -Differences between this Triton version and the CUDA version: -- Triton version doesn't support dropout. -- Triton forward is generally faster than CUDA forward, while Triton backward is -generally slower than CUDA backward. Overall Triton forward + backward is slightly slower -than CUDA forward + backward. -- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor). -- Triton version supports attention bias, while CUDA version doesn't. -""" - -import math - -import torch - -import triton -import triton.language as tl - - -# Disabling autotune for now, set num_warps=4 if headdim=64 and num_warps=8 if headdim=128 -# @triton.autotune( -# configs=[ -# triton.Config({"BLOCK_M": 128, "BLOCK_N": 128}, num_warps=4, num_stages=1), -# # This config has a race condition when EVEN_M == False, disabling it for now. 
-# # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64}, num_warps=4, num_stages=1), -# ], -# key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'] -# ) -@triton.heuristics( - { - "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, - "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, - "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], - } -) -@triton.jit -def _fwd_kernel( - Q, K, V, Bias, Out, - Lse, TMP, # NOTE: TMP is a scratchpad buffer to workaround a compiler bug - softmax_scale, - stride_qb, stride_qh, stride_qm, - stride_kb, stride_kh, stride_kn, - stride_vb, stride_vh, stride_vn, - stride_bb, stride_bh, stride_bm, - stride_ob, stride_oh, stride_om, - nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, - CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, -): - start_m = tl.program_id(0) - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - # off_b = tl.program_id(1) - # off_h = tl.program_id(2) - # off_hb = off_b * nheads + off_h - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_HEADDIM) - # Initialize pointers to Q, K, V - # Adding parenthesis around indexing might use int32 math instead of int64 math? - # https://github.com/openai/triton/issues/741 - # I'm seeing a tiny bit of difference (5-7us) - q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :]) - k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :]) - v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :]) - if BIAS_TYPE == 'vector': - b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n - elif BIAS_TYPE == 'matrix': - b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :]) - # initialize pointer to m and l - t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m - lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") - acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32) - # load q: it will stay in SRAM throughout - # [2022-10-30] TD: Triton bug - in the case of EVEN_M=True and EVEN_N=False, if we just call - # tl.load(q_ptrs), we get the wrong output! 
- if EVEN_M & EVEN_N: - if EVEN_HEADDIM: - q = tl.load(q_ptrs) - else: - q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - else: - if EVEN_HEADDIM: - q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0) - else: - q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), - other=0.0) - # loop over k, v and update accumulator - end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k) - for start_n in range(0, end_n, BLOCK_N): - start_n = tl.multiple_of(start_n, BLOCK_N) - # -- compute qk ---- - if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition - if EVEN_HEADDIM: - k = tl.load(k_ptrs + start_n * stride_kn) - else: - k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0) - else: - if EVEN_HEADDIM: - k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, - other=0.0) - else: - k = tl.load(k_ptrs + start_n * stride_kn, - mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), - other=0.0) - qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - qk += tl.dot(q, k, trans_b=True) - # Trying to combine the two masks seem to make the result wrong - if not EVEN_N: # Need to mask out otherwise the softmax is wrong - qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf")) - if IS_CAUSAL: - qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf")) - if BIAS_TYPE != 'none': - if BIAS_TYPE == 'vector': - if EVEN_N: - bias = tl.load(b_ptrs + start_n).to(tl.float32) - else: - bias = tl.load(b_ptrs + start_n, mask=(start_n + offs_n) < seqlen_k, other=0.0).to(tl.float32) - bias = bias[None, :] - elif BIAS_TYPE == 'matrix': - if EVEN_M & EVEN_N: - bias = tl.load(b_ptrs + start_n).to(tl.float32) - else: - bias = tl.load(b_ptrs + start_n, - mask=(offs_m[:, None] < seqlen_q) - & ((start_n + offs_n)[None, :] < seqlen_k), - other=0.0).to(tl.float32) - # Slightly faster to multiply the softmax_scale in the tl.exp below since the compiler - # can then fuse the mult and add into an fma instruction. But if we have bias we need to - # to multiply with softmax_scale here. 
- qk = qk * softmax_scale + bias - m_ij = tl.maximum(tl.max(qk, 1), lse_i) - p = tl.exp(qk - m_ij[:, None]) - else: - m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i) - p = tl.exp(qk * softmax_scale - m_ij[:, None]) - l_ij = tl.sum(p, 1) - - # scale acc_o - acc_o_scale = tl.exp(m_i - m_ij) - - # # -- update output accumulator -- - # BUG: have to store and immediately load - tl.store(t_ptrs, acc_o_scale) - acc_o_scale = tl.load(t_ptrs) - acc_o = acc_o * acc_o_scale[:, None] - # update acc_o - if EVEN_N & EVEN_M: # If we just do "if EVEN_N", there seems to be some race condition - if EVEN_HEADDIM: - v = tl.load(v_ptrs + start_n * stride_vn) - else: - v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0) - else: - if EVEN_HEADDIM: - v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, - other=0.0) - else: - v = tl.load(v_ptrs + start_n * stride_vn, - mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), - other=0.0) - p = p.to(v.dtype) - acc_o += tl.dot(p, v) - - # -- update statistics - m_i = m_ij - l_i_new = tl.exp(lse_i - m_ij) + l_ij - lse_i = m_ij + tl.log(l_i_new) - - o_scale = tl.exp(m_i - lse_i) - # BUG: have to store and immediately load - tl.store(t_ptrs, o_scale) - o_scale = tl.load(t_ptrs) - acc_o = acc_o * o_scale[:, None] - # rematerialize offsets to save registers - start_m = tl.program_id(0) - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - # write back l and m - lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m - tl.store(lse_ptrs, lse_i) - # initialize pointers to output - offs_d = tl.arange(0, BLOCK_HEADDIM) - out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :]) - if EVEN_M: - if EVEN_HEADDIM: - tl.store(out_ptrs, acc_o) - else: - tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim) - else: - if EVEN_HEADDIM: - tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q) - else: - tl.store(out_ptrs, acc_o, - mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim)) - - -@triton.jit -def _bwd_preprocess_do_o_dot( - Out, DO, Delta, - stride_ob, stride_oh, stride_om, - stride_dob, stride_doh, stride_dom, - nheads, seqlen_q, seqlen_q_rounded, headdim, - BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, -): - start_m = tl.program_id(0) - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - # initialize offsets - offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) - offs_d = tl.arange(0, BLOCK_HEADDIM) - # load - o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], - mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32) - do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], - mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32) - delta = tl.sum(o * do, axis=1) - # write-back - tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta) - - -@triton.jit -def _bwd_store_dk_dv( - dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, - EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, -): - # [2022-11-01] TD: Same bug. 
In the case of EVEN_N=True and EVEN_M=False, - # if we just call tl.store(dv_ptrs), there's a race condition - if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - tl.store(dv_ptrs, dv) - tl.store(dk_ptrs, dk) - else: - tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim) - tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim) - else: - if EVEN_HEADDIM: - tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k) - tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k) - else: - tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) - tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)) - - -@triton.jit -def _bwd_kernel_one_col_block( - start_n, - Q, K, V, Bias, - DO, DQ, DK, DV, - LSE, D, - softmax_scale, - stride_qm, stride_kn, stride_vn, stride_bm, - stride_dom, stride_dqm, stride_dkn, stride_dvn, - seqlen_q, seqlen_k, headdim, - ATOMIC_ADD: tl.constexpr, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, -): - # We need to make sure begin_m is a multiple of BLOCK_M (not BLOCK_N) - begin_m = 0 if not IS_CAUSAL else ((start_n * BLOCK_N) // BLOCK_M) * BLOCK_M - # initialize row/col offsets - offs_qm = begin_m + tl.arange(0, BLOCK_M) - offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) - offs_m = tl.arange(0, BLOCK_M) - offs_d = tl.arange(0, BLOCK_HEADDIM) - # initialize pointers to value-like data - q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :]) - k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :]) - v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :]) - do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :]) - dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :]) - if BIAS_TYPE == 'vector': - b_ptrs = Bias + offs_n - elif BIAS_TYPE == 'matrix': - b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :]) - # initialize dv and dk - dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) - dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32) - # There seems to be some problem with Triton pipelining that makes results wrong for - # headdim=64, seqlen=(113, 255), bias_type='matrix'. In this case the for loop - # may have zero step, and pipelining with the bias matrix could screw it up. - # So we just exit early. - if begin_m >= seqlen_q: - dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) - dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) - _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, - EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM) - return - # k and v stay in SRAM throughout - # [2022-10-30] TD: Same bug as the fwd. In the case of EVEN_N=True and EVEN_M=False, - # if we just call tl.load(k_ptrs), we get the wrong output! 
- if EVEN_N & EVEN_M: - if EVEN_HEADDIM: - k = tl.load(k_ptrs) - v = tl.load(v_ptrs) - else: - k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - else: - if EVEN_HEADDIM: - k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) - v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0) - else: - k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), - other=0.0) - v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), - other=0.0) - # loop over rows - num_block_m = tl.cdiv(seqlen_q, BLOCK_M) - for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M): - start_m = tl.multiple_of(start_m, BLOCK_M) - offs_m_curr = start_m + offs_m - # load q, k, v, do on-chip - # Same bug as below. Otherwise gives wrong result for headdim=40, seqlen=(128, 117) - if EVEN_M & EVEN_HEADDIM: - q = tl.load(q_ptrs) - else: - if EVEN_HEADDIM: - q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) - else: - q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) - & (offs_d[None, :] < headdim), other=0.0) - # recompute p = softmax(qk, dim=-1).T - qk = tl.dot(q, k, trans_b=True) - # Trying to combine the two masks seem to make the result wrong - if not EVEN_N: # Need to mask out otherwise the softmax is wrong - qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf")) - if IS_CAUSAL: - qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf")) - if BIAS_TYPE != 'none': - tl.debug_barrier() # Race condition otherwise - if BIAS_TYPE == 'vector': - if EVEN_N: - bias = tl.load(b_ptrs).to(tl.float32) - else: - bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32) - bias = bias[None, :] - elif BIAS_TYPE == 'matrix': - if EVEN_M & EVEN_N: - bias = tl.load(b_ptrs).to(tl.float32) - else: - bias = tl.load(b_ptrs, - mask=(offs_m_curr[:, None] < seqlen_q) - & (offs_n[None, :] < seqlen_k), - other=0.0).to(tl.float32) - qk = qk * softmax_scale + bias - # There seems to be a race condition when headdim=48/96, and dq, dk, dv are wrong. - # Also wrong for headdim=64. - if not (EVEN_M & EVEN_HEADDIM): - tl.debug_barrier() - lse_i = tl.load(LSE + offs_m_curr) - if BIAS_TYPE == 'none': - p = tl.exp(qk * softmax_scale - lse_i[:, None]) - else: - p = tl.exp(qk - lse_i[:, None]) - # compute dv - # [2022-10-30] TD: A Triton bug: if EVEN_M=True and EVEN_HEADDIM=False, if we call - # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0), we get wrong outputs - # in the case of headdim=48/96, seqlen_q & seqlen_k >= 512. If headdim=40 or seqlen < 512, - # the output is correct. - if EVEN_M & EVEN_HEADDIM: - do = tl.load(do_ptrs) - else: - # [2022-11-01] TD: Triton bug, there's a race condition if we just use m_mask and not d_mask. - do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) - & (offs_d[None, :] < headdim), other=0.0) - # if EVEN_M: - # if EVEN_HEADDIM: - # do = tl.load(do_ptrs) - # else: - # do = tl.load(do_ptrs, mask=offs_d[None, :] < headdim, other=0.0) - # else: - # if EVEN_HEADDIM: - # do = tl.load(do_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0) - # else: - # do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) - # & (offs_d[None, :] < headdim), other=0.0) - dv += tl.dot(p.to(do.dtype), do, trans_a=True) - # compute dp = dot(v, do) - # There seems to be a race condition when headdim=48/96, and dq, dk are wrong. 
- # Also wrong for headdim=128, seqlen=(108, 256), and ATOMIC_ADD=True - # Also wrong for headdim=64, seqlen=(1023, 1024), and ATOMIC_ADD=False - if not (EVEN_M & EVEN_HEADDIM): - tl.debug_barrier() - dp = tl.dot(do, v, trans_b=True) - # There's a race condition for headdim=48 - if not EVEN_HEADDIM: - tl.debug_barrier() - # compute ds = p * (dp - delta[:, None]) - # Putting the subtraction after the dp matmul (instead of before) is slightly faster - Di = tl.load(D + offs_m_curr) - # Converting ds to q.dtype here reduces register pressure and makes it much faster - # for BLOCK_HEADDIM=128 - ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype) - # compute dk = dot(ds.T, q) - dk += tl.dot(ds, q, trans_a=True) - # compute dq - if not (EVEN_M & EVEN_HEADDIM): # Otherewise there's a race condition when BIAS_TYPE='matrix' - tl.debug_barrier() - if not ATOMIC_ADD: - if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M - dq = tl.load(dq_ptrs, eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, eviction_policy="evict_last") - else: - if EVEN_HEADDIM: - dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, - eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, - eviction_policy="evict_last") - else: - dq = tl.load(dq_ptrs, - mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), - other=0.0, eviction_policy="evict_last") - dq += tl.dot(ds, k) - tl.store(dq_ptrs, dq, - mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), - eviction_policy="evict_last") - else: # If we're parallelizing across the seqlen_k dimension - dq = tl.dot(ds, k) - if EVEN_M & EVEN_HEADDIM: # Race condition if we just do EVEN_M - tl.atomic_add(dq_ptrs, dq) - else: - if EVEN_HEADDIM: - tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q) - else: - tl.atomic_add(dq_ptrs, dq, - mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim)) - # increment pointers - dq_ptrs += BLOCK_M * stride_dqm - q_ptrs += BLOCK_M * stride_qm - do_ptrs += BLOCK_M * stride_dom - if BIAS_TYPE == 'matrix': - b_ptrs += BLOCK_M * stride_bm - # write-back - dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :]) - dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :]) - _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, - EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM) - - -def init_to_zero(name): - return lambda nargs: nargs[name].zero_() - - -@triton.autotune( - configs=[ - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), - triton.Config({"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), - # Other configs seem to give wrong results when seqlen_q % 128 != 0, disabling them for now - # # Kernel is buggy (give wrong result) if we set BLOCK_m=128, BLOCK_n=64, num_warps=*4* - # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), - # triton.Config({"BLOCK_M": 128, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), - # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False}, num_warps=4, num_stages=1, pre_hook=init_to_zero('DQ')), - # triton.Config({"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True}, num_warps=4, num_stages=1, 
pre_hook=init_to_zero('DQ')), - ], - key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'], -) -@triton.heuristics( - { - "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0, - "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0, - "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"], - } -) -@triton.jit -def _bwd_kernel( - Q, K, V, Bias, - DO, DQ, DK, DV, - LSE, D, - softmax_scale, - stride_qb, stride_qh, stride_qm, - stride_kb, stride_kh, stride_kn, - stride_vb, stride_vh, stride_vn, - stride_bb, stride_bh, stride_bm, - stride_dob, stride_doh, stride_dom, - stride_dqb, stride_dqh, stride_dqm, - stride_dkb, stride_dkh, stride_dkn, - stride_dvb, stride_dvh, stride_dvn, - nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, - CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, - BIAS_TYPE: tl.constexpr, - IS_CAUSAL: tl.constexpr, - BLOCK_HEADDIM: tl.constexpr, - SEQUENCE_PARALLEL: tl.constexpr, - EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, - BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, -): - off_hb = tl.program_id(1) - off_b = off_hb // nheads - off_h = off_hb % nheads - # offset pointers for batch/head - Q += off_b * stride_qb + off_h * stride_qh - K += off_b * stride_kb + off_h * stride_kh - V += off_b * stride_vb + off_h * stride_vh - DO += off_b * stride_dob + off_h * stride_doh - DQ += off_b * stride_dqb + off_h * stride_dqh - DK += off_b * stride_dkb + off_h * stride_dkh - DV += off_b * stride_dvb + off_h * stride_dvh - if BIAS_TYPE != 'none': - Bias += off_b * stride_bb + off_h * stride_bh - # pointer to row-wise quantities in value-like data - D += off_hb * seqlen_q_rounded - LSE += off_hb * seqlen_q_rounded - if not SEQUENCE_PARALLEL: - num_block_n = tl.cdiv(seqlen_k, BLOCK_N) - for start_n in range(0, num_block_n): - _bwd_kernel_one_col_block( - start_n, - Q, K, V, Bias, - DO, DQ, DK, DV, - LSE, D, - softmax_scale, - stride_qm, stride_kn, stride_vn, stride_bm, - stride_dom, stride_dqm, stride_dkn, stride_dvn, - seqlen_q, seqlen_k, headdim, - ATOMIC_ADD=False, - BIAS_TYPE=BIAS_TYPE, - IS_CAUSAL=IS_CAUSAL, - BLOCK_HEADDIM=BLOCK_HEADDIM, - EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, - BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N - ) - else: - start_n = tl.program_id(0) - _bwd_kernel_one_col_block( - start_n, - Q, K, V, Bias, - DO, DQ, DK, DV, - LSE, D, - softmax_scale, - stride_qm, stride_kn, stride_vn, stride_bm, - stride_dom, stride_dqm, stride_dkn, stride_dvn, - seqlen_q, seqlen_k, headdim, - ATOMIC_ADD=True, - BIAS_TYPE=BIAS_TYPE, - IS_CAUSAL=IS_CAUSAL, - BLOCK_HEADDIM=BLOCK_HEADDIM, - EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, - BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N - ) - - -def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None): - # shape constraints - batch, seqlen_q, nheads, d = q.shape - _, seqlen_k, _, _ = k.shape - assert k.shape == (batch, seqlen_k, nheads, d) - assert v.shape == (batch, seqlen_k, nheads, d) - assert d <= 128, 'FlashAttention only support head dimensions up to 128' - assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type' - assert q.dtype in [torch.float16, torch.bfloat16], 'Only support fp16 and bf16' - assert q.is_cuda and k.is_cuda and v.is_cuda - softmax_scale = softmax_scale or 1.0 / math.sqrt(d) - - has_bias = bias is not None - bias_type = 'none' - if has_bias: - assert bias.dtype in [q.dtype, torch.float] - assert bias.is_cuda - assert bias.dim() == 4 - if bias.stride(-1) != 1: - bias = 
bias.contiguous() - if bias.shape[2:] == (1, seqlen_k): - bias_type = 'vector' - elif bias.shape[2:] == (seqlen_q, seqlen_k): - bias_type = 'matrix' - else: - raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k)' - ' or (seqlen_q, seqlen_k)') - bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) - bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) - - seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 - lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) - tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32) - o = torch.empty_like(q) - - BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) - BLOCK = 128 - num_warps = 4 if d <= 64 else 8 - grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) - _fwd_kernel[grid]( - q, k, v, bias, o, - lse, tmp, - softmax_scale, - q.stride(0), q.stride(2), q.stride(1), - k.stride(0), k.stride(2), k.stride(1), - v.stride(0), v.stride(2), v.stride(1), - *bias_strides, - o.stride(0), o.stride(2), o.stride(1), - nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, - seqlen_q // 32, seqlen_k // 32, # key for triton cache (limit number of compilations) - # Can't use kwargs here because triton autotune expects key to be args, not kwargs - # IS_CAUSAL=causal, BLOCK_HEADDIM=d, - bias_type, causal, BLOCK_HEADDIM, - BLOCK_M=BLOCK, BLOCK_N=BLOCK, - num_warps=num_warps, - num_stages=1, - ) - return o, lse, softmax_scale # softmax_scale could have been updated - - -def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None): - # Make sure that the last dimension is contiguous - if do.stride(-1) != 1: - do = do.contiguous() - batch, seqlen_q, nheads, d = q.shape - _, seqlen_k, _, _ = k.shape - # assert d in {16, 32, 64, 128} - assert d <= 128 - seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128 - assert lse.shape == (batch, nheads, seqlen_q_rounded) - assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1 - assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1 - softmax_scale = softmax_scale or 1.0 / math.sqrt(d) - # dq_accum = torch.zeros_like(q, dtype=torch.float32) - dq_accum = torch.empty_like(q, dtype=torch.float32) - delta = torch.empty_like(lse) - # delta = torch.zeros_like(lse) - - BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16) - grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads) - _bwd_preprocess_do_o_dot[grid]( - o, do, delta, - o.stride(0), o.stride(2), o.stride(1), - do.stride(0), do.stride(2), do.stride(1), - nheads, seqlen_q, seqlen_q_rounded, d, - BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM, - ) - - has_bias = bias is not None - bias_type = 'none' - if has_bias: - assert bias.dtype in [q.dtype, torch.float] - assert bias.is_cuda - assert bias.dim() == 4 - assert bias.stride(-1) == 1 - if bias.shape[2:] == (1, seqlen_k): - bias_type = 'vector' - elif bias.shape[2:] == (seqlen_q, seqlen_k): - bias_type = 'matrix' - else: - raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k)' - ' or (seqlen_q, seqlen_k)') - bias = bias.expand(batch, nheads, seqlen_q, seqlen_k) - bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0) - - # BLOCK_M = 128 - # BLOCK_N = 64 - # num_warps = 4 - grid = lambda META: (triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1, - batch * nheads) - _bwd_kernel[grid]( - q, k, v, bias, - do, dq_accum, dk, dv, - lse, delta, - 
softmax_scale, - q.stride(0), q.stride(2), q.stride(1), - k.stride(0), k.stride(2), k.stride(1), - v.stride(0), v.stride(2), v.stride(1), - *bias_strides, - do.stride(0), do.stride(2), do.stride(1), - dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), - dk.stride(0), dk.stride(2), dk.stride(1), - dv.stride(0), dv.stride(2), dv.stride(1), - nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, - seqlen_q // 32, seqlen_k // 32, # key for triton cache (limit number of compilations) - # Can't use kwargs here because triton autotune expects key to be args, not kwargs - # IS_CAUSAL=causal, BLOCK_HEADDIM=d, - bias_type, causal, BLOCK_HEADDIM, - # SEQUENCE_PARALLEL=False, - # BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N, - # num_warps=num_warps, - # num_stages=1, - ) - dq.copy_(dq_accum) - - -class FlashAttnQKVPackedFunc(torch.autograd.Function): - - @staticmethod - def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None): - """ - qkv: (batch, seqlen, 3, nheads, headdim) - bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen). - For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen). - ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen) - """ - # Make sure that the last dimension is contiguous - if qkv.stride(-1) != 1: - qkv = qkv.contiguous() - o, lse, ctx.softmax_scale = _flash_attn_forward( - qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, - softmax_scale=softmax_scale - ) - ctx.save_for_backward(qkv, o, lse, bias) - ctx.causal = causal - return o - - @staticmethod - def backward(ctx, do): - qkv, o, lse, bias = ctx.saved_tensors - assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet' - # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd - # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. - with torch.inference_mode(): - dqkv = torch.empty_like(qkv) - _flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, - dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], - bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale) - return dqkv, None, None, None - - -flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply - - -class FlashAttnKVPackedFunc(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None): - """ - q: (batch, seqlen_q, nheads, headdim) - kv: (batch, seqlen_k, 2, nheads, headdim) - bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). - For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). - ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) - """ - # Make sure that the last dimension is contiguous - q, kv = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]] - o, lse, ctx.softmax_scale = _flash_attn_forward( - q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale - ) - ctx.save_for_backward(q, kv, o, lse, bias) - ctx.causal = causal - return o - - @staticmethod - def backward(ctx, do): - q, kv, o, lse, bias = ctx.saved_tensors - if len(ctx.needs_input_grad) >= 3: - assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet' - # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd - # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. 
- with torch.inference_mode(): - dq = torch.empty_like(q) - dkv = torch.empty_like(kv) - _flash_attn_backward(do, q, kv[:, :, 0], kv[:, :, 1], o, lse, - dq, dkv[:, :, 0], dkv[:, :, 1], - bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale) - return dq, dkv, None, None, None - - -flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply - - -class FlashAttnFunc(torch.autograd.Function): - - @staticmethod - def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None): - """ - q: (batch_size, seqlen_q, nheads, headdim) - k, v: (batch_size, seqlen_k, nheads, headdim) - bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k). - For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k). - ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k) - """ - # Make sure that the last dimension is contiguous - q, k, v = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]] - o, lse, ctx.softmax_scale = _flash_attn_forward( - q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale - ) - ctx.save_for_backward(q, k, v, o, lse, bias) - ctx.causal = causal - return o - - @staticmethod - def backward(ctx, do): - q, k, v, o, lse, bias = ctx.saved_tensors - assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet' - # Triton's autotune causes the Tensor._version to change, and so Pytorch autograd - # does a memcpy. To avoid this we run in inference_mode, which doesn't track the version. - with torch.inference_mode(): - dq = torch.empty_like(q) - dk = torch.empty_like(k) - dv = torch.empty_like(v) - _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, - bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale) - return dq, dk, dv, None, None, None - - -flash_attn_func = FlashAttnFunc.apply \ No newline at end of file diff --git a/src/lmflow/utils/llava_conversation_lib.py b/src/lmflow/utils/llava_conversation_lib.py index 2a18384ce..166c95b6f 100644 --- a/src/lmflow/utils/llava_conversation_lib.py +++ b/src/lmflow/utils/llava_conversation_lib.py @@ -1,10 +1,11 @@ import dataclasses -from enum import auto, Enum -from typing import List, Tuple +from enum import Enum, auto +from typing import Optional class SeparatorStyle(Enum): """Different separator style.""" + SINGLE = auto() TWO = auto() MPT = auto() @@ -15,13 +16,14 @@ class SeparatorStyle(Enum): @dataclasses.dataclass class Conversation: """A class that keeps all conversation history.""" + system: str - roles: List[str] - messages: List[List[str]] + roles: list[str] + messages: list[list[str]] offset: int sep_style: SeparatorStyle = SeparatorStyle.SINGLE sep: str = "###" - sep2: str = None + sep2: Optional[str] = None version: str = "Unknown" skip_next: bool = False @@ -32,7 +34,7 @@ def get_prompt(self): messages = self.messages.copy() init_role, init_msg = messages[0].copy() init_msg = init_msg[0].replace("", "").strip() - if 'mmtag' in self.version: + if "mmtag" in self.version: messages[0] = (init_role, init_msg) messages.insert(0, (self.roles[0], "")) messages.insert(1, (self.roles[1], "Received.")) @@ -79,7 +81,8 @@ def get_prompt(self): if message: if type(message) is tuple: message, _, _ = message - if i == 0: message = wrap_sys(self.system) + message + if i == 0: + message = wrap_sys(self.system) + message if i % 2 == 0: message = wrap_inst(message) ret += self.sep + message @@ -108,14 +111,17 @@ def append_message(self, role, message): def get_images(self, return_pil=False): images = [] - for i, (role, msg) in 
enumerate(self.messages[self.offset:]): + for i, (role, msg) in enumerate(self.messages[self.offset :]): if i % 2 == 0: if type(msg) is tuple: import base64 from io import BytesIO + from PIL import Image + msg, image, image_process_mode = msg if image_process_mode == "Pad": + def expand2square(pil_img, background_color=(122, 116, 104)): width, height = pil_img.size if width == height: @@ -128,6 +134,7 @@ def expand2square(pil_img, background_color=(122, 116, 104)): result = Image.new(pil_img.mode, (height, height), background_color) result.paste(pil_img, ((height - width) // 2, 0)) return result + image = expand2square(image) elif image_process_mode == "Crop": pass @@ -157,11 +164,12 @@ def expand2square(pil_img, background_color=(122, 116, 104)): def to_gradio_chatbot(self): ret = [] - for i, (role, msg) in enumerate(self.messages[self.offset:]): + for i, (role, msg) in enumerate(self.messages[self.offset :]): if i % 2 == 0: if type(msg) is tuple: import base64 from io import BytesIO + msg, image, image_process_mode = msg max_hw, min_hw = max(image.size), min(image.size) aspect_ratio = max_hw / min_hw @@ -179,7 +187,7 @@ def to_gradio_chatbot(self): img_b64_str = base64.b64encode(buffered.getvalue()).decode() img_str = f'user upload image' ret.append([img_str, None]) - msg = msg.replace('', '').strip() + msg = msg.replace("", "").strip() if len(msg) > 0: ret.append([msg, None]) else: @@ -197,7 +205,8 @@ def copy(self): sep_style=self.sep_style, sep=self.sep, sep2=self.sep2, - version=self.version) + version=self.version, + ) def dict(self): if len(self.get_images()) > 0: @@ -221,11 +230,12 @@ def dict(self): conv_vicuna_v0 = Conversation( system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", + "The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=("Human", "Assistant"), messages=( ("Human", "What are the key differences between renewable and non-renewable energy sources?"), - ("Assistant", + ( + "Assistant", "Renewable energy sources are those that can be replenished naturally in a relatively " "short amount of time, such as solar, wind, hydro, geothermal, and biomass. " "Non-renewable energy sources, on the other hand, are finite and will eventually be " @@ -243,7 +253,8 @@ def dict(self): "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different " "situations and needs, while non-renewable sources are more rigid and inflexible.\n" "6. Sustainability: Renewable energy sources are more sustainable over the long term, while " - "non-renewable sources are not, and their depletion can lead to economic and social instability.\n") + "non-renewable sources are not, and their depletion can lead to economic and social instability.\n", + ), ), offset=2, sep_style=SeparatorStyle.SINGLE, @@ -277,8 +288,8 @@ def dict(self): conv_llava_llama_2 = Conversation( system="You are a helpful language and vision assistant. 
" - "You are able to understand the visual content that the user provides, " - "and assist the user with a variety of tasks using natural language.", + "You are able to understand the visual content that the user provides, " + "and assist the user with a variety of tasks using natural language.", roles=("USER", "ASSISTANT"), version="llama_v2", messages=(), @@ -302,8 +313,7 @@ def dict(self): conv_llava_plain = Conversation( system="", roles=("", ""), - messages=( - ), + messages=(), offset=0, sep_style=SeparatorStyle.PLAIN, sep="\n", @@ -311,12 +321,9 @@ def dict(self): conv_llava_v0 = Conversation( system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", + "The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=("Human", "Assistant"), - messages=( - ("Human", "Hi!"), - ("Assistant", "Hi there! How can I help you today?") - ), + messages=(("Human", "Hi!"), ("Assistant", "Hi there! How can I help you today?")), offset=2, sep_style=SeparatorStyle.SINGLE, sep="###", @@ -324,11 +331,10 @@ def dict(self): conv_llava_v0_mmtag = Conversation( system="A chat between a curious user and an artificial intelligence assistant. " - "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." - "The visual content will be provided with the following format: visual content.", + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." + "The visual content will be provided with the following format: visual content.", roles=("Human", "Assistant"), - messages=( - ), + messages=(), offset=0, sep_style=SeparatorStyle.SINGLE, sep="###", @@ -337,7 +343,7 @@ def dict(self): conv_llava_v1 = Conversation( system="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", + "The assistant gives helpful, detailed, and polite answers to the human's questions.", roles=("USER", "ASSISTANT"), version="v1", messages=(), @@ -349,8 +355,8 @@ def dict(self): conv_llava_v1_mmtag = Conversation( system="A chat between a curious user and an artificial intelligence assistant. " - "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." - "The visual content will be provided with the following format: visual content.", + "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language." 
+ "The visual content will be provided with the following format: visual content.", roles=("USER", "ASSISTANT"), messages=(), offset=0, @@ -360,14 +366,13 @@ def dict(self): version="v1_mmtag", ) -default_conversation = conv_vicuna_v1 # currently only support the v1 version +default_conversation = conv_vicuna_v1 # currently only support the v1 version conv_templates = { "default": conv_vicuna_v0, "v0": conv_vicuna_v0, "v1": conv_vicuna_v1, "vicuna_v1": conv_vicuna_v1, "llama_2": conv_llama_2, - "plain": conv_llava_plain, "v0_plain": conv_llava_plain, "llava_v0": conv_llava_v0, @@ -375,7 +380,6 @@ def dict(self): "llava_v1": conv_llava_v1, "v1_mmtag": conv_llava_v1_mmtag, "llava_llama_2": conv_llava_llama_2, - "mpt": conv_mpt, } diff --git a/src/lmflow/utils/model.py b/src/lmflow/utils/model.py index 99f7cc37d..40b015292 100644 --- a/src/lmflow/utils/model.py +++ b/src/lmflow/utils/model.py @@ -1,25 +1,23 @@ #!/usr/bin/env python -# coding=utf-8 # Copyright 2024 Statistics and Machine Learning Research Group. All rights reserved. import logging -from typing import Dict, Any, List, Tuple, Union from transformers import AutoTokenizer from lmflow.args import ModelArguments - logger = logging.getLogger(__name__) -def check_homogeneity(model_args_list: List[ModelArguments]) -> bool: - assert all(isinstance(model_args, ModelArguments) for model_args in model_args_list), \ +def check_homogeneity(model_args_list: list[ModelArguments]) -> bool: + assert all(isinstance(model_args, ModelArguments) for model_args in model_args_list), ( "model_args_list should be a list of ModelArguments objects." + ) assert len(model_args_list) > 1, "model_args_list should have at least two elements." - + tokenizer_names = [] for model_args in model_args_list: tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, use_fast=False) tokenizer_names.append(tokenizer.__class__.__name__) - - return len(set(tokenizer_names)) == 1 \ No newline at end of file + + return len(set(tokenizer_names)) == 1 diff --git a/src/lmflow/utils/multimodal.py b/src/lmflow/utils/multimodal.py index ba3669833..2c0589de5 100644 --- a/src/lmflow/utils/multimodal.py +++ b/src/lmflow/utils/multimodal.py @@ -1,20 +1,19 @@ import glob + import torch -from transformers import LlamaConfig from tqdm import tqdm +from transformers import LlamaConfig def update_custom_config(config, model_args): if model_args.llm_model_name_or_path is not None: - text_config = LlamaConfig.from_pretrained( - model_args.llm_model_name_or_path) + text_config = LlamaConfig.from_pretrained(model_args.llm_model_name_or_path) config.text_config = text_config config.with_qformer = model_args.with_qformer config.custom_vision_model = model_args.custom_vision_model if model_args.custom_vision_model: # config.vision_model_args = model_args - config.image_encoder_name_or_path = \ - model_args.image_encoder_name_or_path + config.image_encoder_name_or_path = model_args.image_encoder_name_or_path config.vision_select_layer = model_args.vision_select_layer if getattr(model_args, "vision_select_feature", None) is not None: config.vision_select_feature = model_args.vision_select_feature @@ -35,16 +34,16 @@ def load_llava_pretrain_model(model, checkpoint_path): model.load_state_dict(new_state_dict, strict=False) return model + def adapt_llava_model_to_lmflow_type(state_dict): new_state_dict = {} for key, item in state_dict.items(): key = key.replace("model.layers", "language_model.model.layers") - key = key.replace("model.embed_tokens", - 
"language_model.model.embed_tokens") + key = key.replace("model.embed_tokens", "language_model.model.embed_tokens") key = key.replace("model.mm_projector", "language_projection") key = key.replace("lm_head", "language_model.lm_head") key = key.replace("model.norm", "language_model.model.norm") if "vision_tower" in key: continue new_state_dict[key] = item - return new_state_dict \ No newline at end of file + return new_state_dict diff --git a/src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py b/src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py index 136420e94..441e109f9 100644 --- a/src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py +++ b/src/lmflow/utils/position_interpolation/llama_rope_scaled_monkey_patch.py @@ -4,23 +4,24 @@ import transformers import transformers.models.llama.modeling_llama + class CondenseRotaryEmbedding(torch.nn.Module): def __init__(self, dim, pi_ratio, ntk_ratio, max_position_embeddings=2048, base=10000, device=None): super().__init__() - + self.ntk_ratio = ntk_ratio max_position_embeddings *= ntk_ratio - base = base * ntk_ratio ** (dim / (dim-2)) #Base change formula - + base = base * ntk_ratio ** (dim / (dim - 2)) # Base change formula + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) self.register_buffer("inv_freq", inv_freq) - + self.pi_ratio = pi_ratio max_position_embeddings *= pi_ratio self.max_seq_len_cached = max_position_embeddings t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype) / pi_ratio freqs = torch.einsum("i,j->ij", t, self.inv_freq) - + # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1) dtype = torch.get_default_dtype() @@ -34,16 +35,19 @@ def forward(self, x, seq_len=None): self.max_seq_len_cached = seq_len t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype) / self.pi_ratio freqs = torch.einsum("i,j->ij", t, self.inv_freq) - + # Different from paper, but it uses a different permutation in order to obtain the same calculation emb = torch.cat((freqs, freqs), dim=-1).to(x.device) self.register_buffer("cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False) self.register_buffer("sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False) - + return ( self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), ) + def replace_llama_with_condense(pi_ratio, ntk_ratio): - transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial(CondenseRotaryEmbedding, pi_ratio=pi_ratio, ntk_ratio=ntk_ratio) \ No newline at end of file + transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial( + CondenseRotaryEmbedding, pi_ratio=pi_ratio, ntk_ratio=ntk_ratio + ) diff --git a/src/lmflow/utils/test_utils.py b/src/lmflow/utils/test_utils.py new file mode 100644 index 000000000..d30aec68a --- /dev/null +++ b/src/lmflow/utils/test_utils.py @@ -0,0 +1,19 @@ +from typing import Optional + +import torch +import torch.nn as nn + + +def compare_model( + model_ref: "nn.Module", model_trained: "nn.Module", module_trained: Optional[list[str]] = None +) -> None: + state_dict_ref = model_ref.state_dict() + state_dict_trained = model_trained.state_dict() + assert set(state_dict_ref.keys()) == set(state_dict_trained.keys()) + + for name in state_dict_ref.keys(): + if module_trained is not None: + if any([module 
in name for module in module_trained]): + assert torch.allclose(state_dict_ref[name], state_dict_trained[name], rtol=1e-4, atol=1e-5) is False + else: + assert torch.allclose(state_dict_ref[name], state_dict_trained[name], rtol=1e-4, atol=1e-5) is True diff --git a/src/lmflow/utils/versioning.py b/src/lmflow/utils/versioning.py index bce50b6b8..a08bf119f 100644 --- a/src/lmflow/utils/versioning.py +++ b/src/lmflow/utils/versioning.py @@ -1,12 +1,11 @@ import importlib -import sys import logging -from typing import Tuple, List, Union -from importlib.metadata import version, PackageNotFoundError +import sys +from pathlib import Path +from typing import Union import pkg_resources - logger = logging.getLogger(__name__) @@ -20,30 +19,29 @@ def _is_package_available(package_name: str, skippable: bool = False): importlib.import_module(package_name) return True except Exception as e: - if e.__class__ == ModuleNotFoundError: + if isinstance(e, ModuleNotFoundError): return False else: if skippable: - logger.warning(f'An error occurred when importing {package_name}:\n{e}\n{package_name} is disabled.') + logger.warning(f"An error occurred when importing {package_name}:\n{e}\n{package_name} is disabled.") return False else: raise e - - -def _is_packages_available(packages: Union[List[str], List[Tuple[str, bool]]]): + + +def _is_packages_available(packages: Union[list[str], list[tuple[str, bool]]]): if isinstance(packages[0], str): return all([_is_package_available(package) for package in packages]) elif isinstance(packages[0], tuple): return all([_is_package_available(package, skippable) for package, skippable in packages]) else: raise ValueError(f"Invalid type of packages: {type(packages[0])}") - + def is_package_version_at_least(package_name, min_version): try: package_version = pkg_resources.get_distribution(package_name).version - if (pkg_resources.parse_version(package_version) - < pkg_resources.parse_version(min_version)): + if pkg_resources.parse_version(package_version) < pkg_resources.parse_version(min_version): return False except pkg_resources.DistributionNotFound: return False @@ -75,4 +73,32 @@ def is_trl_available(): def is_multimodal_available(): - return _is_packages_available(["PIL"]) \ No newline at end of file + return _is_packages_available(["PIL"]) + + +def is_deepspeed_available(): + return _is_package_available("deepspeed") + + +def get_lmflow_dir(return_src_dir: bool = False) -> Path: + try: + from importlib.util import find_spec + + spec = find_spec("lmflow") + if spec is None: + return None + + origin = spec.origin + if origin is None: + return None + origin = Path(origin) + if origin.name == "__init__.py": + if return_src_dir: + return origin.parent + else: + return origin.parent.parent.parent + else: + return None + + except (ImportError, AttributeError): + return None diff --git a/src/lmflow/version.py b/src/lmflow/version.py index 00ec2dcdb..5becc17c0 100644 --- a/src/lmflow/version.py +++ b/src/lmflow/version.py @@ -1 +1 @@ -__version__ = "0.0.9" +__version__ = "1.0.0" diff --git a/tests/conftest.py b/tests/conftest.py index ff5378519..9d7aee5f9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,16 +1,8 @@ -def pytest_addoption(parser): - parser.addoption("--cpu-only", action="store_true", help="run tests that only requires cpu") - parser.addoption("--skip-slow", action="store_true", help="skip slow tests") - -def pytest_collection_modifyitems(config, items): - new_items = [] - for item in items: - func = item.function - if config.getoption("--cpu-only"): - if not (func.__doc__ 
and "#cpu" in func.__doc__.lower()): - continue - if config.getoption("--skip-slow"): - if func.__doc__ and "#slow" in func.__doc__.lower(): - continue - new_items.append(item) - items[:] = new_items \ No newline at end of file +import pytest + + +def pytest_configure(config: pytest.Config): + config.addinivalue_line("markers", "gpu: requires gpu") + config.addinivalue_line("markers", "slow: marks tests as slow") + config.addinivalue_line("markers", "lmflow_core: tests for core lmflow functionality") + config.addinivalue_line("markers", "dothis: mark for dev to do the specified tests only") diff --git a/tests/datasets/test_dataset.py b/tests/datasets/test_dataset.py index 98cebd30a..c3c9fa358 100644 --- a/tests/datasets/test_dataset.py +++ b/tests/datasets/test_dataset.py @@ -1,5 +1,4 @@ #!/bin/env/python3 -# coding=utf-8 """A one-line summary of the module or program, terminated by a period. Leave one blank line. The rest of this docstring should contain an @@ -12,11 +11,10 @@ foo = ClassFoo() bar = foo.FunctionBar() """ -from __future__ import absolute_import -import unittest import json import os +import unittest from pathlib import Path from lmflow.args import DatasetArguments @@ -24,40 +22,35 @@ class DatasetTest(unittest.TestCase): - def test_init(self): - dataset_dir = 'data/example_dataset/train' - data_args = DatasetArguments( - dataset_path=dataset_dir - ) - dataset = Dataset(data_args, backend='huggingface') + dataset_dir = "data/example_dataset/train" + data_args = DatasetArguments(dataset_path=dataset_dir) + dataset = Dataset(data_args, backend="huggingface") hf_dataset = dataset.get_backend_dataset() - with open(os.path.join(Path(dataset_dir), 'train_50.json'), 'r') as fin: + with open(os.path.join(Path(dataset_dir), "train_50.json")) as fin: json_obj = json.load(fin) for i in range(len(hf_dataset)): - self.assertEqual(json_obj['instances'][i], hf_dataset[i]) - + self.assertEqual(json_obj["instances"][i], hf_dataset[i]) def test_create_from_dict(self): data_dict = { "type": "text2text", "instances": [ - { "input": "INPUT 1", "output": "OUTPUT 1" }, - { "input": "INPUT 2", "output": "OUTPUT 2" }, - ] + {"input": "INPUT 1", "output": "OUTPUT 1"}, + {"input": "INPUT 2", "output": "OUTPUT 2"}, + ], } dataset = Dataset.create_from_dict(data_dict) self.assertEqual(dataset.to_dict(), data_dict) - def test_create_from_dict_bad_type(self): data_dict = { "type": "non-supported", "instances": [ - { "input": "INPUT 1", "output": "OUTPUT 1" }, - { "input": "INPUT 2", "output": "OUTPUT 2" }, - ] + {"input": "INPUT 1", "output": "OUTPUT 1"}, + {"input": "INPUT 2", "output": "OUTPUT 2"}, + ], } with self.assertRaises(ValueError): dataset = Dataset.create_from_dict(data_dict) diff --git a/tests/models/test_auto_model.py b/tests/models/test_auto_model.py index 7c86bbecf..a8abda559 100644 --- a/tests/models/test_auto_model.py +++ b/tests/models/test_auto_model.py @@ -3,43 +3,37 @@ from lmflow.args import ModelArguments from lmflow.models.auto_model import AutoModel from lmflow.models.hf_decoder_model import HFDecoderModel -from lmflow.models.text_regression_model import TextRegressionModel -from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel + +# from lmflow.models.hf_encoder_decoder_model import HFEncoderDecoderModel MODEL_NAME = "gpt2" class AutoModelTest(unittest.TestCase): - def test_get_decoder_model(self): - model_args = ModelArguments( - arch_type="decoder_only", model_name_or_path=MODEL_NAME) + model_args = ModelArguments(arch_type="decoder_only", 
model_name_or_path=MODEL_NAME) model = AutoModel.get_model(model_args) self.assertTrue(isinstance(model, HFDecoderModel)) - # This unit test is commented out since the encoder decoder model has not been fully implemented - ''' + """ def test_get_text_regression_model(self): model_args = ModelArguments( arch_type="text_regression", model_name_or_path=MODEL_NAME) model = AutoModel.get_model(model_args) self.assertTrue(isinstance(model, TextRegressionModel)) - ''' - + """ # This unit test is commented out since the encoder decoder model has not been fully implemented - ''' + """ def test_get_encoder_decoder(self): model_args = ModelArguments( arch_type="encoder_decoder", model_name_or_path=MODEL_NAME) model = AutoModel.get_model(model_args) self.assertTrue(isinstance(model, HFEncoderDecoderModel)) - ''' - + """ def test_get_unsupported_model(self): - model_args = ModelArguments( - arch_type="unsupported model", model_name_or_path=MODEL_NAME) + model_args = ModelArguments(arch_type="unsupported model", model_name_or_path=MODEL_NAME) with self.assertRaises(NotImplementedError): model = AutoModel.get_model(model_args) diff --git a/tests/models/test_hf_decoder_model.py b/tests/models/test_hf_decoder_model.py index c6e8d096d..4a07c32ff 100644 --- a/tests/models/test_hf_decoder_model.py +++ b/tests/models/test_hf_decoder_model.py @@ -18,7 +18,6 @@ import json import os from pathlib import Path -from transformers.deepspeed import HfDeepSpeedConfig from lmflow.args import DatasetArguments, ModelArguments from lmflow.datasets.dataset import Dataset @@ -653,13 +652,12 @@ def test_inference(self): ds_config_path = "examples/ds_config.json" with open (ds_config_path, "r") as f: ds_config = json.load(f) - dschf = HfDeepSpeedConfig(ds_config) model_name = 'gpt2' model_args = ModelArguments( model_name_or_path=model_name, use_ram_optimized_load=False ) - model = HFDecoderModel(model_args, tune_strategy='none', ds_config=ds_config) + model = HFDecoderModel(model_args, do_train=False, ds_config=ds_config) self.local_rank = int(os.getenv("LOCAL_RANK", "0")) self.world_size = int(os.getenv("WORLD_SIZE", "1")) torch.cuda.set_device(self.local_rank) diff --git a/tests/models/test_tool_inferencer.py b/tests/models/test_tool_inferencer.py index fd856bf57..8d66135a4 100644 --- a/tests/models/test_tool_inferencer.py +++ b/tests/models/test_tool_inferencer.py @@ -1,11 +1,9 @@ -from lmflow.pipeline.inferencer import ToolInferencer import unittest -from lmflow.args import InferencerArguments -from lmflow.args import ModelArguments -from lmflow.args import DatasetArguments -from lmflow.models import hf_decoder_model -CODE_1 = "print(\"hello world\")" +from lmflow.args import DatasetArguments, InferencerArguments, ModelArguments +from lmflow.pipeline.inferencer import ToolInferencer + +CODE_1 = 'print("hello world")' RES_1 = "hello world\n" CODE_2 = "b=a+1\nprint(b)" RES_2 = """Traceback (most recent call last): @@ -13,24 +11,18 @@ NameError: name 'a' is not defined """ + class ToolInferencerTest(unittest.TestCase): def set_up(self): model_args = ModelArguments(model_name_or_path="codellama/CodeLlama-7b-instruct-hf") - model = hf_decoder_model.HFDecoderModel(model_args) inferencer_args = InferencerArguments() data_args = DatasetArguments() self.toolinf = ToolInferencer(model_args, data_args, inferencer_args) - - def test_code_exec_1(self,code=CODE_1, expected_output=RES_1): - + + def test_code_exec_1(self, code=CODE_1, expected_output=RES_1): toolinf_res = self.toolinf.code_exec(code) self.assertEqual(toolinf_res, 
expected_output) - - def test_code_exec_2(self,code=CODE_2): + + def test_code_exec_2(self, code=CODE_2): toolinf_res = self.toolinf.code_exec(code) self.assertNotEqual(toolinf_res.returncode, 0) - -unittest.main() - - - \ No newline at end of file diff --git a/tests/pipeline/test_auto_pipeline.py b/tests/pipeline/test_auto_pipeline.py index e7a09786a..621c2d920 100644 --- a/tests/pipeline/test_auto_pipeline.py +++ b/tests/pipeline/test_auto_pipeline.py @@ -1,28 +1,20 @@ import unittest -from lmflow.args import DatasetArguments -from lmflow.args import EvaluatorArguments -from lmflow.args import FinetunerArguments -from lmflow.args import InferencerArguments -from lmflow.args import ModelArguments -from lmflow.args import RaftAlignerArguments +from lmflow.args import DatasetArguments, EvaluatorArguments, FinetunerArguments, InferencerArguments, ModelArguments from lmflow.pipeline.auto_pipeline import AutoPipeline from lmflow.pipeline.evaluator import Evaluator from lmflow.pipeline.finetuner import Finetuner from lmflow.pipeline.inferencer import Inferencer -from lmflow.pipeline.raft_aligner import RaftAligner MODEL_NAME = "gpt2" class AutoPipelineTest(unittest.TestCase): - def test_get_evaluator_pipeline(self): model_args = ModelArguments(model_name_or_path=MODEL_NAME) dataset_args = DatasetArguments() evaluator_args = EvaluatorArguments() - pipeline = AutoPipeline.get_pipeline( - "evaluator", model_args, dataset_args, evaluator_args) + pipeline = AutoPipeline.get_pipeline("evaluator", model_args, dataset_args, evaluator_args) self.assertTrue(isinstance(pipeline, Evaluator)) @@ -30,8 +22,7 @@ def test_get_finetuner_pipeline(self): model_args = ModelArguments(model_name_or_path=MODEL_NAME) dataset_args = DatasetArguments() finetuner_args = FinetunerArguments(output_dir="~/tmp") - pipeline = AutoPipeline.get_pipeline( - "finetuner", model_args, dataset_args, finetuner_args) + pipeline = AutoPipeline.get_pipeline("finetuner", model_args, dataset_args, finetuner_args) self.assertTrue(isinstance(pipeline, Finetuner)) @@ -39,24 +30,13 @@ def test_get_inferencer_pipeline(self): model_args = ModelArguments(model_name_or_path=MODEL_NAME) dataset_args = DatasetArguments() inferencer_args = InferencerArguments() - pipeline = AutoPipeline.get_pipeline( - "inferencer", model_args, dataset_args, inferencer_args) + pipeline = AutoPipeline.get_pipeline("inferencer", model_args, dataset_args, inferencer_args) self.assertTrue(isinstance(pipeline, Inferencer)) - def test_get_raft_aligner_pipeline(self): - model_args = ModelArguments(model_name_or_path=MODEL_NAME) - dataset_args = DatasetArguments() - raft_aligner_args = RaftAlignerArguments(output_dir="~/tmp") - pipeline = AutoPipeline.get_pipeline( - "raft_aligner", model_args, dataset_args, raft_aligner_args) - - self.assertTrue(isinstance(pipeline, RaftAligner)) - def test_get_unsupported_pipeline(self): model_args = ModelArguments(model_name_or_path=MODEL_NAME) dataset_args = DatasetArguments() - with self.assertRaisesRegex(NotImplementedError, "Pipeline \"unsupported\" is not supported"): - pipeline = AutoPipeline.get_pipeline( - "unsupported", model_args, dataset_args, None) + with self.assertRaisesRegex(NotImplementedError, 'Pipeline "unsupported" is not supported'): + pipeline = AutoPipeline.get_pipeline("unsupported", model_args, dataset_args, None) diff --git a/tests/pipeline/test_finetuner_distributed_loss.py b/tests/pipeline/test_finetuner_distributed_loss.py new file mode 100644 index 000000000..b84831abe --- /dev/null +++ 
b/tests/pipeline/test_finetuner_distributed_loss.py @@ -0,0 +1,411 @@ +import json +import os +from datetime import datetime +from enum import Enum +from typing import Optional + +import pytest +import torch +from transformers.testing_utils import ( + TestCasePlus, + _RunOutput, + execute_subprocess_async, + get_torch_dist_unique_port, +) + +from lmflow.utils.versioning import get_lmflow_dir, is_deepspeed_available + +TEST_START_TIME = datetime.now().strftime("%Y%m%d_%H%M%S") +KEEP_TEST_FILES = os.environ.get("LMFLOW_KEEP_TEST_FILES", "1") == "1" +LOG_TEST_STD = os.environ.get("LMFLOW_LOG_TEST_STD", "1") == "1" +TEST_MODEL_NAME_OR_PATH = os.getenv("LMFLOW_TEST_MODEL_NAME_OR_PATH", "Qwen/Qwen2.5-0.5B-Instruct") +TEST_STEPS = int(os.getenv("LMFLOW_TEST_STEPS", 100)) +# python subprocess takes list of strings as command, don't forget to split +ACCELERATE_ARGS = f""" + --machine_rank 0 + --main_training_function main + --num_machines 1 + --num_processes {torch.cuda.device_count()} + --rdzv_backend static + --same_network + --mixed_precision no + --dynamo_backend no + --main_process_port {get_torch_dist_unique_port()} +""".split() +ACCELERATE_DSZ3_ARGS = """ + --use_deepspeed + --deepspeed_multinode_launcher standard + --offload_optimizer_device none + --offload_param_device none + --zero3_init_flag true + --zero3_save_16bit_model true + --zero_stage 3 +""".split() +ACCELERATE_FSDP_ARGS = """ + --use_fsdp + --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP + --fsdp_min_num_params 1000000 + --fsdp_backward_prefetch BACKWARD_PRE + --fsdp_forward_prefetch false + --fsdp_cpu_ram_efficient_loading true + --fsdp_offload_params false + --fsdp_sharding_strategy FULL_SHARD + --fsdp_state_dict_type FULL_STATE_DICT + --fsdp_sync_module_states true + --fsdp_use_orig_params true +""".split() +DEEPSPEED_BASE_ARGS = f""" + --master_port {get_torch_dist_unique_port()} + --num_gpus {torch.cuda.device_count()} +""".split() +DEEPSPEED_ZERO3_CONFIG_ARGS = """ + --deepspeed configs/deepspeed/zero3.json +""".split() +LMFLOW_BASE_ARGS = f""" + --model_name_or_path {TEST_MODEL_NAME_OR_PATH} + --trust_remote_code 0 + --dataset_path {get_lmflow_dir() / "data/alpaca/train_conversation"} + --overwrite_output_dir + --conversation_template qwen2 + --disable_group_texts 1 + --num_train_epochs 1 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 1 + --block_size 512 + --learning_rate 2e-5 + --lr_scheduler_type cosine + --validation_split_percentage 0 + --logging_steps 1 + --max_steps {TEST_STEPS} + --do_train + --ddp_timeout 72000 + --save_steps 5000 + --use_flash_attention 0 + --gradient_checkpointing 0 + --dataloader_num_workers 8 + --report_to wandb + --seed 42 + --save_strategy no +""".split() +LMFLOW_FP32_ARGS = """ + --torch_dtype float32 +""".split() +LMFLOW_BF16_ARGS = """ + --bf16 + --torch_dtype bfloat16 +""".split() +LMFLOW_LORA_ARGS = """ + --use_lora 1 + --lora_r 8 + --lora_alpha 32 + --lora_dropout 0.1 +""".split() +LMFLOW_QLORA_ARGS = """ + --use_qlora 1 + --quant_bit 4 + --lora_r 8 + --lora_alpha 32 + --lora_dropout 0.1 +""".split() +LMFLOW_LISA_ARGS = """ + --lisa_activated_layers 1 + --lisa_interval_steps 20 +""".split() +LMFLOW_CUSTOM_OPTIM_ARGS = """ + --use_customized_optim 1 + --customized_optim adabelief + --optim_beta1 0.9 + --optim_beta2 0.99 + --optim_weight_decay 0 +""".split() + + +class AccelerateBackend(Enum): + FSDP = "fsdp" + DEEPSPEED_ZERO3 = "deepspeed_zero3" + + +class DeepSpeedZeroStage(Enum): + ZERO3 = "zero3" + + +class PeftMethod(Enum): + NO = "full" + LORA = "lora" + QLORA = 
"qlora" + LISA = "lisa" + + +class TestDtype(Enum): + FP32 = "fp32" + BF16 = "bf16" + + +ACCELERATE_BACKEND_ARGS_MAPPING = { + AccelerateBackend.FSDP: ACCELERATE_FSDP_ARGS, + AccelerateBackend.DEEPSPEED_ZERO3: ACCELERATE_DSZ3_ARGS, +} +DEEPSPEED_ZERO_CONFIG_MAPPING = { + DeepSpeedZeroStage.ZERO3: DEEPSPEED_ZERO3_CONFIG_ARGS, +} +LMFLOW_PEFT_ARGS_MAPPING = { + PeftMethod.LORA: LMFLOW_LORA_ARGS, + PeftMethod.QLORA: LMFLOW_QLORA_ARGS, + PeftMethod.LISA: LMFLOW_LISA_ARGS, +} + + +class TestFinetunerBase(TestCasePlus): + """Full finetune, and other base functionality tests""" + + def setUp(self): + os.environ["WANDB_PROJECT"] = "lmflow-pytest" + self.lmflow_dir = get_lmflow_dir() + self.lmflow_examples_dir = self.lmflow_dir / "examples" + return super().setUp() + + def _make_cmd(self, run_name: str, args: list[list[str]]) -> list[str]: + cmd = [] + for arg in args: + assert isinstance(arg, list) + cmd.extend(arg) + + self.output_dir = self.get_auto_remove_tmp_dir( + tmp_dir=f"./tests_out/{run_name}", + before=True, + after=not KEEP_TEST_FILES, + ) + logging_args = f""" + --run_name {TEST_START_TIME}_{run_name} + --output_dir {self.output_dir} + """.split() + + return cmd + logging_args + + def _log_std(self, res: _RunOutput): + if LOG_TEST_STD: + with open(f"{self.output_dir}/stdout.log", "w") as f: + for line in res.stdout: + f.write(line + "\n") + with open(f"{self.output_dir}/stderr.log", "w") as f: + for line in res.stderr: + f.write(line + "\n") + + def _load_trainer_state(self, output_dir: str) -> dict: + with open(f"{output_dir}/trainer_state.json") as f: + trainer_state = json.load(f) + return trainer_state + + def _run_with_accelerate( + self, + backend: AccelerateBackend, + *, + dtype: Optional[TestDtype] = TestDtype.FP32, + extra_args: Optional[list[str]] = None, + run_name_note: Optional[str] = None, + ) -> dict: + assert isinstance(backend, AccelerateBackend) + run_name = f"test_finetuner_accelerate_{backend.value}" + if run_name_note: + run_name += f"_{str(run_name_note)}" + + all_args = [ + ["accelerate", "launch"], + ACCELERATE_ARGS, + ACCELERATE_BACKEND_ARGS_MAPPING[backend], + [f"{self.lmflow_examples_dir}/finetune.py"], + LMFLOW_BASE_ARGS, + ] + if dtype == TestDtype.FP32: + all_args.append(LMFLOW_FP32_ARGS) + elif dtype == TestDtype.BF16: + all_args.append(LMFLOW_BF16_ARGS) + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + if extra_args and len(extra_args) > 0: + all_args.append(extra_args) + + cmd = self._make_cmd(run_name, all_args) + res = execute_subprocess_async(cmd) + self._log_std(res) + return self._load_trainer_state(self.output_dir) + + def _run_with_deepspeed( + self, + zero_stage: DeepSpeedZeroStage, + *, + dtype: Optional[TestDtype] = TestDtype.FP32, + extra_args: Optional[list[str]] = None, + run_name_note: Optional[str] = None, + ) -> dict: + assert isinstance(zero_stage, DeepSpeedZeroStage) + run_name = f"test_finetuner_deepspeed_{zero_stage.value}" + if run_name_note: + run_name += f"_{str(run_name_note)}" + + all_args = [ + ["deepspeed"], + DEEPSPEED_BASE_ARGS, + [f"{self.lmflow_examples_dir}/finetune.py"], + DEEPSPEED_ZERO_CONFIG_MAPPING[zero_stage], + LMFLOW_BASE_ARGS, + ] + if dtype == TestDtype.FP32: + all_args.append(LMFLOW_FP32_ARGS) + elif dtype == TestDtype.BF16: + all_args.append(LMFLOW_BF16_ARGS) + else: + raise ValueError(f"Unsupported dtype: {dtype}") + + if extra_args and len(extra_args) > 0: + all_args.append(extra_args) + + cmd = self._make_cmd(run_name, all_args) + res = execute_subprocess_async(cmd) + self._log_std(res) + return 
self._load_trainer_state(self.output_dir) + + def _compare_loss(self, trainer_state1: dict, trainer_state2: dict): + loss1 = [] + loss2 = [] + for step_idx in range(TEST_STEPS): + assert trainer_state1["log_history"][step_idx]["step"] == trainer_state2["log_history"][step_idx]["step"], ( + "log_history step mismatch, check trainer_state.json" + ) + loss1.append(trainer_state1["log_history"][step_idx]["loss"]) + loss2.append(trainer_state2["log_history"][step_idx]["loss"]) + + self.assertTrue(torch.allclose(torch.tensor(loss1), torch.tensor(loss2), rtol=1e-2, atol=0)) + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_accelerate_dsz3_vs_fsdp(self): + dsz3_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, run_name_note=PeftMethod.NO.value + ) + fsdp_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.FSDP, run_name_note=PeftMethod.NO.value + ) + self._compare_loss(dsz3_trainer_state, fsdp_trainer_state) + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_deepspeed_z3_vs_accelerate_dsz3(self): + deepspeed_z3_trainer_state = self._run_with_deepspeed( + zero_stage=DeepSpeedZeroStage.ZERO3, run_name_note=PeftMethod.NO.value + ) + accelerate_dsz3_train_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, run_name_note=PeftMethod.NO.value + ) + self._compare_loss(deepspeed_z3_trainer_state, accelerate_dsz3_train_state) + + +class TestFinetunerLora(TestFinetunerBase): + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_accelerate_dsz3_vs_fsdp(self): + dsz3_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.LORA], + run_name_note=PeftMethod.LORA.value, + ) + fsdp_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.FSDP, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.LORA], + run_name_note=PeftMethod.LORA.value, + ) + self._compare_loss(dsz3_trainer_state, fsdp_trainer_state) + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_deepspeed_z3_vs_accelerate_dsz3(self): + deepspeed_z3_trainer_state = self._run_with_deepspeed( + zero_stage=DeepSpeedZeroStage.ZERO3, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.LORA], + run_name_note=PeftMethod.LORA.value, + ) + accelerate_dsz3_train_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.LORA], + run_name_note=PeftMethod.LORA.value, + ) + self._compare_loss(deepspeed_z3_trainer_state, accelerate_dsz3_train_state) + + +class TestFinetunerQlora(TestFinetunerBase): + """Currently only supports bf16""" + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_accelerate_dsz3_vs_fsdp(self): + dsz3_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, + dtype=TestDtype.BF16, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.QLORA], + run_name_note=PeftMethod.QLORA.value, + ) + 
fsdp_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.FSDP, + dtype=TestDtype.BF16, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.QLORA], + run_name_note=PeftMethod.QLORA.value, + ) + self._compare_loss(dsz3_trainer_state, fsdp_trainer_state) + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_deepspeed_z3_vs_accelerate_dsz3(self): + deepspeed_z3_trainer_state = self._run_with_deepspeed( + zero_stage=DeepSpeedZeroStage.ZERO3, + dtype=TestDtype.BF16, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.QLORA], + run_name_note=PeftMethod.QLORA.value, + ) + accelerate_dsz3_train_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, + dtype=TestDtype.BF16, + extra_args=LMFLOW_PEFT_ARGS_MAPPING[PeftMethod.QLORA], + run_name_note=PeftMethod.QLORA.value, + ) + self._compare_loss(deepspeed_z3_trainer_state, accelerate_dsz3_train_state) + + +class TestFinetunerCustomOptim(TestFinetunerBase): + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_accelerate_dsz3_vs_fsdp(self): + dsz3_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, extra_args=LMFLOW_CUSTOM_OPTIM_ARGS, run_name_note="custom_optim" + ) + fsdp_trainer_state = self._run_with_accelerate( + backend=AccelerateBackend.FSDP, extra_args=LMFLOW_CUSTOM_OPTIM_ARGS, run_name_note="custom_optim" + ) + self._compare_loss(dsz3_trainer_state, fsdp_trainer_state) + + @pytest.mark.lmflow_core + @pytest.mark.gpu + @pytest.mark.slow + @pytest.mark.skipif(not is_deepspeed_available(), reason="Deepspeed is not available") + def test_loss_deepspeed_z3_vs_accelerate_dsz3(self): + deepspeed_z3_trainer_state = self._run_with_deepspeed( + zero_stage=DeepSpeedZeroStage.ZERO3, extra_args=LMFLOW_CUSTOM_OPTIM_ARGS, run_name_note="custom_optim" + ) + accelerate_dsz3_train_state = self._run_with_accelerate( + backend=AccelerateBackend.DEEPSPEED_ZERO3, extra_args=LMFLOW_CUSTOM_OPTIM_ARGS, run_name_note="custom_optim" + ) + self._compare_loss(deepspeed_z3_trainer_state, accelerate_dsz3_train_state) diff --git a/tests/pipeline/test_memory_safe_vllm_inferencer.py b/tests/pipeline/test_memory_safe_vllm_inferencer.py index 19c9a6a1e..540b208b9 100644 --- a/tests/pipeline/test_memory_safe_vllm_inferencer.py +++ b/tests/pipeline/test_memory_safe_vllm_inferencer.py @@ -1,23 +1,25 @@ -# cannot use unittest, since memory safe vllm inference uses stdout, +# cannot use unittest, since memory safe vllm inference uses stdout, # which has conflicts with unittest stdout. 
import logging -import json -from lmflow.args import DatasetArguments, ModelArguments, InferencerArguments +import pytest + +vllm = pytest.importorskip("vllm") + +from lmflow.args import DatasetArguments, InferencerArguments, ModelArguments +from lmflow.datasets import Dataset from lmflow.models.hf_decoder_model import HFDecoderModel from lmflow.pipeline.vllm_inferencer import MemorySafeVLLMInferencer -from lmflow.datasets import Dataset - logger = logging.getLogger(__name__) model_args = ModelArguments( - 'Qwen/Qwen2-0.5B', - torch_dtype='auto', + "Qwen/Qwen2-0.5B", + torch_dtype="auto", ) data_args = DatasetArguments( - './data/alpaca/test_conversation', + "./data/alpaca/test_conversation", preprocessing_num_workers=4, ) inferencer_args = InferencerArguments( @@ -27,7 +29,7 @@ temperature=1.0, max_new_tokens=1024, save_results=True, - results_path='./data/mem_safe_vllm_res.json', + results_path="./data/mem_safe_vllm_res.json", use_vllm=True, enable_decode_inference_result=False, vllm_gpu_memory_utilization=0.95, @@ -48,15 +50,17 @@ def test_init(self): def test_inference(self): res = self.inferencer.inference() - test_res = all([ - isinstance(res, list), - isinstance(res[0], list), - isinstance(res[0][0], list), - isinstance(res[0][0][0], int), - ]) + test_res = all( + [ + isinstance(res, list), + isinstance(res[0], list), + isinstance(res[0][0], list), + isinstance(res[0][0][0], int), + ] + ) self.status.append(test_res) logger.warning(f"test_inference: {test_res}") - + def test_inference_detokenize(self): inferencer_args.enable_decode_inference_result = True self.inferencer = MemorySafeVLLMInferencer( @@ -65,21 +69,23 @@ def test_inference_detokenize(self): inferencer_args=inferencer_args, ) res = self.inferencer.inference() - test_res = all([ - isinstance(res, list), - isinstance(res[0], list), - isinstance(res[0][0], str), - ]) + test_res = all( + [ + isinstance(res, list), + isinstance(res[0], list), + isinstance(res[0][0], str), + ] + ) self.status.append(test_res) logger.warning(f"test_inference_detokenize: {test_res}") - + def summary(self): logger.warning(f"MemorySafeVLLMInferencerTest: {all(self.status)}") - - + + if __name__ == "__main__": test = MemorySafeVLLMInferencerTest() test.test_init() test.test_inference() test.test_inference_detokenize() - test.summary() \ No newline at end of file + test.summary() diff --git a/tests/utils/test_conversation_formatter.py b/tests/utils/test_conversation_formatter.py index 39425cbc7..bb433fa3e 100644 --- a/tests/utils/test_conversation_formatter.py +++ b/tests/utils/test_conversation_formatter.py @@ -1,25 +1,25 @@ import unittest + from lmflow.utils.conversation_template.base import StringFormatter, TemplateComponent class StringFormatterTest(unittest.TestCase): - def test_format_string_component(self): formatter = StringFormatter( template=[ - TemplateComponent(type='token', content='bos_token'), - TemplateComponent(type='string', content='[INST] {{content}} [/INST]'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="token", content="bos_token"), + TemplateComponent(type="string", content="[INST] {{content}} [/INST]"), + TemplateComponent(type="token", content="eos_token"), ] ) - formatted_components = formatter.format(content='Who are you?') + formatted_components = formatter.format(content="Who are you?") expected_components = [ - TemplateComponent(type='token', content='bos_token'), - TemplateComponent(type='string', content='[INST] Who are you? 
[/INST]'), - TemplateComponent(type='token', content='eos_token') + TemplateComponent(type="token", content="bos_token"), + TemplateComponent(type="string", content="[INST] Who are you? [/INST]"), + TemplateComponent(type="token", content="eos_token"), ] self.assertEqual(formatted_components, expected_components) -if __name__ == '__main__': - unittest.main() \ No newline at end of file +if __name__ == "__main__": + unittest.main() diff --git a/tests/utils/test_conversation_template.py b/tests/utils/test_conversation_template.py index 69fa1f02e..bc93b4a37 100644 --- a/tests/utils/test_conversation_template.py +++ b/tests/utils/test_conversation_template.py @@ -1,186 +1,187 @@ -from __future__ import absolute_import import unittest from transformers import AutoTokenizer from lmflow.utils.conversation_template import PRESET_TEMPLATES - CONVERSATION_SINGLETURN = { "system": "sysinfo", - "messages": [ - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hi!" - } - ] + "messages": [{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi!"}], } CONVERSATION_SINGLETURN_LLAMA2 = [ - { - "role": "user", - "content": "[INST] <>\nsysinfo\n<>\n\nHello [/INST]" - }, - { - "role": "assistant", - "content": "Hi!" - } + {"role": "user", "content": "[INST] <>\nsysinfo\n<>\n\nHello [/INST]"}, + {"role": "assistant", "content": "Hi!"}, ] CONVERSATION_SINGLETURN_LLAMA2_IDS = [ ( - [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 9675, 3888, 13, - 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 518, 29914, 25580, 29962], - [6324, 29991, 2] + [ + 1, + 518, + 25580, + 29962, + 3532, + 14816, + 29903, + 6778, + 13, + 9675, + 3888, + 13, + 29966, + 829, + 14816, + 29903, + 6778, + 13, + 13, + 10994, + 518, + 29914, + 25580, + 29962, + ], + [6324, 29991, 2], ) ] CONVERSATION_MULTITURN = { "system": "sysinfo", "messages": [ - { - "role": "user", - "content": "Hello" - }, - { - "role": "assistant", - "content": "Hi!" - }, - { - "role": "user", - "content": "How are you?" - }, - { - "role": "assistant", - "content": "I'm good, thanks!" - } - ] + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi!"}, + {"role": "user", "content": "How are you?"}, + {"role": "assistant", "content": "I'm good, thanks!"}, + ], } CONVERSATION_MULTITURN_LLAMA2 = [ - { - "role": "user", - "content": "[INST] <>\nsysinfo\n<>\n\nHello [/INST]" - }, - { - "role": "assistant", - "content": "Hi!" - }, - { - "role": "user", - "content": "[INST] How are you? [/INST]" - }, - { - "role": "assistant", - "content": "I'm good, thanks!" - } + {"role": "user", "content": "[INST] <>\nsysinfo\n<>\n\nHello [/INST]"}, + {"role": "assistant", "content": "Hi!"}, + {"role": "user", "content": "[INST] How are you? 
[/INST]"}, + {"role": "assistant", "content": "I'm good, thanks!"}, ] CONVERSATION_MULTITURN_LLAMA2_IDS = [ ( - [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 9675, 3888, 13, - 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 518, 29914, 25580, 29962], - [6324, 29991, 2] - ), + [ + 1, + 518, + 25580, + 29962, + 3532, + 14816, + 29903, + 6778, + 13, + 9675, + 3888, + 13, + 29966, + 829, + 14816, + 29903, + 6778, + 13, + 13, + 10994, + 518, + 29914, + 25580, + 29962, + ], + [6324, 29991, 2], + ), ( - [1, 518, 25580, 29962, 1128, 526, 366, 29973, 518, 29914, 25580, 29962], - [306, 29915, 29885, 1781, 29892, 3969, 29991, 2] - ) + [1, 518, 25580, 29962, 1128, 526, 366, 29973, 518, 29914, 25580, 29962], + [306, 29915, 29885, 1781, 29892, 3969, 29991, 2], + ), ] class EmptyConversationTemplateTest(unittest.TestCase): def setUp(self): - MODEL_PATH = 'meta-llama/Llama-2-7b-hf' + MODEL_PATH = "meta-llama/Llama-2-7b-hf" self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False) - self.conversation_template = PRESET_TEMPLATES['empty'] + self.conversation_template = PRESET_TEMPLATES["empty"] def test_encode_conversation_singleturn_llama2(self): res = self.conversation_template.encode_conversation( - tokenizer=self.tokenizer, - messages=CONVERSATION_SINGLETURN_LLAMA2, - system=None, - tools=None + tokenizer=self.tokenizer, messages=CONVERSATION_SINGLETURN_LLAMA2, system=None, tools=None ) self.assertEqual(res, CONVERSATION_SINGLETURN_LLAMA2_IDS) def test_encode_conversation_multiturn_llama2(self): res = self.conversation_template.encode_conversation( - tokenizer=self.tokenizer, - messages=CONVERSATION_MULTITURN_LLAMA2, - system=None, - tools=None + tokenizer=self.tokenizer, messages=CONVERSATION_MULTITURN_LLAMA2, system=None, tools=None ) self.assertEqual(res, CONVERSATION_MULTITURN_LLAMA2_IDS) - - + + class Llama2ConversationTemplateTest(unittest.TestCase): def setUp(self): - MODEL_PATH = 'meta-llama/Llama-2-7b-hf' + MODEL_PATH = "meta-llama/Llama-2-7b-hf" self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False) - self.conversation_template = PRESET_TEMPLATES['llama2'] - + self.conversation_template = PRESET_TEMPLATES["llama2"] + def test_encode_conversation_singleturn(self): res = self.conversation_template.encode_conversation( tokenizer=self.tokenizer, - messages=CONVERSATION_SINGLETURN['messages'], - system=CONVERSATION_SINGLETURN['system'], - tools=None + messages=CONVERSATION_SINGLETURN["messages"], + system=CONVERSATION_SINGLETURN["system"], + tools=None, ) self.assertEqual(res, CONVERSATION_SINGLETURN_LLAMA2_IDS) - + def test_encode_conversation_multiturn(self): res = self.conversation_template.encode_conversation( tokenizer=self.tokenizer, - messages=CONVERSATION_MULTITURN['messages'], - system=CONVERSATION_MULTITURN['system'], - tools=None + messages=CONVERSATION_MULTITURN["messages"], + system=CONVERSATION_MULTITURN["system"], + tools=None, ) self.assertEqual(res, CONVERSATION_MULTITURN_LLAMA2_IDS) - - + + class Qwen2ConversationTemplateTest(unittest.TestCase): def setUp(self): - MODEL_PATH = 'Qwen/Qwen1.5-0.5B-Chat' + MODEL_PATH = "Qwen/Qwen1.5-0.5B-Chat" self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False) - self.conversation_template = PRESET_TEMPLATES['qwen2'] - + self.conversation_template = PRESET_TEMPLATES["qwen2"] + def test_encode_conversation_singleturn(self): res = self.conversation_template.encode_conversation( tokenizer=self.tokenizer, - messages=CONVERSATION_SINGLETURN['messages'], - 
system=CONVERSATION_SINGLETURN['system'], - tools=None + messages=CONVERSATION_SINGLETURN["messages"], + system=CONVERSATION_SINGLETURN["system"], + tools=None, ) print(res) - + def test_encode_conversation_multiturn(self): res = self.conversation_template.encode_conversation( tokenizer=self.tokenizer, - messages=CONVERSATION_MULTITURN['messages'], - system=CONVERSATION_MULTITURN['system'], - tools=None + messages=CONVERSATION_MULTITURN["messages"], + system=CONVERSATION_MULTITURN["system"], + tools=None, ) print(res) - - print('===') - print(self.tokenizer.apply_chat_template( - CONVERSATION_MULTITURN['messages'], - tokenize=True, - add_generation_prompt=False - )) - print('===') - print(self.tokenizer.apply_chat_template( - CONVERSATION_MULTITURN['messages'], - tokenize=False, - add_generation_prompt=False - )) - print('===') - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file + + print("===") + print( + self.tokenizer.apply_chat_template( + CONVERSATION_MULTITURN["messages"], tokenize=True, add_generation_prompt=False + ) + ) + print("===") + print( + self.tokenizer.apply_chat_template( + CONVERSATION_MULTITURN["messages"], tokenize=False, add_generation_prompt=False + ) + ) + print("===") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/utils/test_data_utils.py b/tests/utils/test_data_utils.py index 13960fba4..bd615fa8d 100644 --- a/tests/utils/test_data_utils.py +++ b/tests/utils/test_data_utils.py @@ -1,55 +1,57 @@ #!/bin/env/python3 -# coding=utf-8 -from __future__ import absolute_import import unittest -from lmflow.utils.data_utils import load_data, batchlize, answer_extraction -from lmflow.args import DatasetArguments -groundtruth_inputs = ['The Transformer architecture [START_REF]', - 'The Schwarzschild radius is defined as: \\[', - 'A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? ', - '[START_I_SMILES]', - '[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords', - 'The reason why Transformers replaced RNNs was because', - 'Question: What is the notch signaling pathway?\n\nAnswer:', - '# Multi-Head Attention\n\n', - 'Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n', - 'Lecture 1: The Ising Model\n\n', - 'Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. 
We believe these results demonstrate the potential for language models as a new interface for science. We open source the model for the benefit of the scientific community.\n\nTLDR:', - '[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for', - 'what is the capital of US?', - ] +from lmflow.utils.data_utils import answer_extraction, batchlize, load_data + +groundtruth_inputs = [ + "The Transformer architecture [START_REF]", + "The Schwarzschild radius is defined as: \\[", + "A force of 0.6N is applied to an object, which accelerates at 3m/s. What is its mass? ", + "[START_I_SMILES]", + "[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO] ## Keywords", + "The reason why Transformers replaced RNNs was because", + "Question: What is the notch signaling pathway?\n\nAnswer:", + "# Multi-Head Attention\n\n", + "Title: Self-Supervised Learning, A Survey\n\nAuthors: John Smith\n\n", + "Lecture 1: The Ising Model\n\n", + "Information overload is a major obstacle to scientific progress. The explosive growth in scientific literature and data has made it ever harder to discover useful insights in a large mass of information. Today scientific knowledge is accessed through search engines, but they are unable to organize scientific knowledge alone. In this paper we introduce Galactica: a large language model that can store, combine and reason about scientific knowledge. We train on a large scientific corpus of papers, reference material, knowledge bases and many other sources. We outperform existing models on a range of scientific tasks. On technical knowledge probes such as LaTeX equations, Galactica outperforms the latest GPT-3 by 68.2% versus 49.0%. Galactica also performs well on reasoning, outperforming Chinchilla on mathematical MMLU by 41.3% to 35.7%, and PaLM 540B on MATH with a score of 20.4% versus 8.8%. It also sets a new state-of-the-art on downstream tasks such as PubMedQA and MedMCQA dev of 77.6% and 52.9%. And despite not being trained on a general corpus, Galactica outperforms BLOOM and OPT-175B on BIG-bench. We believe these results demonstrate the potential for language models as a new interface for science. 
We open source the model for the benefit of the scientific community.\n\nTLDR:", + "[START_I_SMILES]C(C(=O)O)N[END_I_SMILES]\n\n## Chemical and Physical Properties\n\nThe following are chemical properties for", + "what is the capital of US?", +] groundtruth_outputs = ["NA"] * 13 -mc_output = ['Answer: (C) Generation of free radicals', - 'Answer: C Generation of free radicals', - 'Answer: C', - 'Answer: (C)', - 'A: C', - 'A: (C)', - 'Output: (C) Generation of free radicals', - 'Output: C Generation of free radicals', - 'Output: C', - 'Output: (C)', - ] +mc_output = [ + "Answer: (C) Generation of free radicals", + "Answer: C Generation of free radicals", + "Answer: C", + "Answer: (C)", + "A: C", + "A: (C)", + "Output: (C) Generation of free radicals", + "Output: C Generation of free radicals", + "Output: C", + "Output: (C)", +] + +mc_answer = ["c"] * 10 -mc_answer = ['c'] * 10 +qa_output = [ + "Yes.", + "Answer: Yes", + "Answer: Yes.", + "Yes ", + "No.", + "Answer: No", + "Answer: No.", + "No ", + "Maybe.", + "Answer: Maybe", + "Answer: Maybe.", + "Maybe ", +] +qa_answer = ["yes"] * 4 + ["no"] * 4 + ["maybe"] * 4 -qa_output = ['Yes.', - 'Answer: Yes', - 'Answer: Yes.', - 'Yes ', - 'No.', - 'Answer: No', - 'Answer: No.', - 'No ', - 'Maybe.', - 'Answer: Maybe', - 'Answer: Maybe.', - 'Maybe ', - ] -qa_answer = ['yes'] * 4 + ['no'] * 4 + ['maybe'] * 4 class DataUtilsTest(unittest.TestCase): def test_load_data(self): @@ -57,31 +59,31 @@ def test_load_data(self): inputs, outputs, datasize = load_data(file_name=file_name) # Test for inputs - for i in range(0,len(inputs)): + for i in range(0, len(inputs)): self.assertEqual(inputs[i], groundtruth_inputs[i]) # Test for outputs - for i in range(0,len(outputs)): + for i in range(0, len(outputs)): self.assertEqual(outputs[i], groundtruth_outputs[i]) # Test for datasize self.assertEqual(datasize, 13) - + def test_batchlize(self): file_name = "data/example_dataset/test/test_13.json" inputs, outputs, datasize = load_data(file_name=file_name) dataset = [] for idx in range(len(outputs)): - dataset.append({"input":inputs[idx], "output":outputs[idx], "input_idx":idx}) + dataset.append({"input": inputs[idx], "output": outputs[idx], "input_idx": idx}) # TODO: add test for random shuffle case - dataloader = batchlize(dataset, 4, random_shuffle= False) - self.assertEqual(len(dataloader), 13 // 4 + 1) + dataloader = batchlize(dataset, 4, random_shuffle=False) + self.assertEqual(len(dataloader), 13 // 4 + 1) def test_answer_extraction(self): # Test for medmcqa dataset - for i in range(0,len(mc_output)): + for i in range(0, len(mc_output)): self.assertEqual(answer_extraction(mc_output[i], answer_type="medmcqa"), mc_answer[i]) # Test for usmle dataset - for i in range(0,len(mc_output)): + for i in range(0, len(mc_output)): self.assertEqual(answer_extraction(mc_output[i], answer_type="usmle"), mc_answer[i]) # Test for pubmedqa dataset - for i in range(0,len(qa_output)): + for i in range(0, len(qa_output)): self.assertEqual(answer_extraction(qa_output[i], answer_type="pubmedqa"), qa_answer[i]) diff --git a/utils/apply_delta.py b/utils/apply_delta.py deleted file mode 100644 index 757be9536..000000000 --- a/utils/apply_delta.py +++ /dev/null @@ -1,165 +0,0 @@ -""" -Apply the delta weights on top of a base model. 
- -Usage: -python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 -""" -import argparse -import gc -import glob -import json -import os -import shutil -import tempfile - -from huggingface_hub import snapshot_download -import torch -from torch import nn -from tqdm import tqdm -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig - - -GB = 1 << 30 - - -def split_files(model_path, tmp_path, split_size): - if not os.path.exists(model_path): - model_path = snapshot_download(repo_id=model_path) - if not os.path.exists(tmp_path): - os.makedirs(tmp_path) - - file_pattern = os.path.join(model_path, "pytorch_model-*.bin") - files = glob.glob(file_pattern) - - part = 0 - try: - for file_path in tqdm(files): - state_dict = torch.load(file_path) - new_state_dict = {} - - current_size = 0 - for name, param in state_dict.items(): - param_size = param.numel() * param.element_size() - - if current_size + param_size > split_size: - new_file_name = f"pytorch_model-{part}.bin" - new_file_path = os.path.join(tmp_path, new_file_name) - torch.save(new_state_dict, new_file_path) - current_size = 0 - new_state_dict = None - gc.collect() - new_state_dict = {} - part += 1 - - new_state_dict[name] = param - current_size += param_size - - new_file_name = f"pytorch_model-{part}.bin" - new_file_path = os.path.join(tmp_path, new_file_name) - torch.save(new_state_dict, new_file_path) - new_state_dict = None - gc.collect() - new_state_dict = {} - part += 1 - except Exception as e: - print(f"An error occurred during split_files: {e}") - shutil.rmtree(tmp_path) - raise - - -def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): - delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) - delta_config = AutoConfig.from_pretrained(delta_path) - - if os.path.exists(target_model_path): - shutil.rmtree(target_model_path) - os.makedirs(target_model_path) - - split_size = 4 * GB - - with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: - print(f"Split files for the base model to {tmp_base_path}") - split_files(base_model_path, tmp_base_path, split_size) - print(f"Split files for the delta weights to {tmp_delta_path}") - split_files(delta_path, tmp_delta_path, split_size) - - base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") - base_files = glob.glob(base_pattern) - delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin") - delta_files = glob.glob(delta_pattern) - delta_state_dict = torch.load(delta_files[0]) - - print("Applying the delta") - weight_map = {} - total_size = 0 - - for i, base_file in tqdm(enumerate(base_files)): - state_dict = torch.load(base_file) - file_name = f"pytorch_model-{i}.bin" - for name, param in state_dict.items(): - if name not in delta_state_dict: - for delta_file in delta_files: - delta_state_dict = torch.load(delta_file) - gc.collect() - if name in delta_state_dict: - break - - state_dict[name] += delta_state_dict[name] - weight_map[name] = file_name - total_size += param.numel() * param.element_size() - gc.collect() - torch.save(state_dict, os.path.join(target_model_path, file_name)) - - with open( - os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" - ) as f: - json.dump( - {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f - ) - - print(f"Saving the target model to {target_model_path}") - delta_tokenizer.save_pretrained(target_model_path) - 
delta_config.save_pretrained(target_model_path) - - -def apply_delta(base_model_path, target_model_path, delta_path): - print(f"Loading the delta weights from {delta_path}") - delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) - delta = AutoModelForCausalLM.from_pretrained( - delta_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True - ) - - print(f"Loading the base model from {base_model_path}") - base = AutoModelForCausalLM.from_pretrained( - base_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True - ) - - print("Applying the delta") - for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): - assert name in delta.state_dict() - param.data += delta.state_dict()[name] - - print(f"Saving the target model to {target_model_path}") - base.save_pretrained(target_model_path) - delta_tokenizer.save_pretrained(target_model_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--base-model-path", type=str, required=True) - parser.add_argument("--target-model-path", type=str, required=True) - parser.add_argument("--delta-path", type=str, required=True) - parser.add_argument( - "--low-cpu-mem", - action="store_true", - help="Lower the cpu memory usage. This will split large files and use " - "disk as swap to reduce the memory usage below 10GB.", - ) - args = parser.parse_args() - - if args.low_cpu_mem: - apply_delta_low_cpu_mem( - args.base_model_path, args.target_model_path, args.delta_path - ) - else: - apply_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/utils/convert_json_to_txt.py b/utils/convert_json_to_txt.py deleted file mode 100644 index e2dd37965..000000000 --- a/utils/convert_json_to_txt.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -import argparse -import logging - -import json -from pathlib import Path - -logging.basicConfig(level=logging.WARNING) - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--dataset_path', default='./data/wiki_zh_eval', type=str, required=False) - parser.add_argument('--output_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False) - parser.add_argument('--overwrite', default=False, type=bool, required=False) - args = parser.parse_args() - - dataset_path = args.dataset_path - outputfile = args.output_path - - outputs_list = [] - data_files = [ - x.absolute().as_posix() - for x in Path(dataset_path).glob("*.json") - ] - - for file_name in data_files: - with open(file_name) as fin: - json_data = json.load(fin) - type = json_data["type"] - for line in json_data["instances"]: - outputs_list.append(line["text"]) - - - if Path(outputfile).exists() and not args.overwrite: - logging.warning(f"File %s exists, will not overwrite.", outputfile) - else: - with open(outputfile, "w") as f: - for line in outputs_list: - f.write(line) - diff --git a/utils/convert_minigpt4_checkpoints.py b/utils/convert_minigpt4_checkpoints.py deleted file mode 100644 index 56c65dc80..000000000 --- a/utils/convert_minigpt4_checkpoints.py +++ /dev/null @@ -1,35 +0,0 @@ -import argparse -import os.path as osp -import torch - -def parse_args(): - parser = argparse.ArgumentParser(description="Convert checkpoint from MiniGPT4") - parser.add_argument("--model_path", type=str, help="the model path for the to convert checkpoint") - parser.add_argument("--save_path", default=None, type=str, help="the save path for converted checkpoint") - args = parser.parse_args() - return 
args - - - - - -if __name__ == "__main__": - args = parse_args() - model = torch.load(args.model_path) - model = model['model'] - new_model = {} - for key, item in model.items(): - key = key.replace("Qformer", "qformer") - key = key.replace("llama_proj", "language_projection") - key = key.replace("llama_model.model", "language_model.model") - new_model[key] = item - if args.save_path is None: - end_string = osp.splitext(args.model_path) - save_path = osp.dirname(args.model_path) + "/" + \ - osp.basename(args.model_path).replace(".pth", "") + \ - "-converted" + osp.splitext(args.model_path)[-1] - else: - save_path = args.save_path - print("save_path: {}".format(save_path)) - - torch.save(new_model, save_path) diff --git a/utils/download_hf_file.py b/utils/download_hf_file.py deleted file mode 100644 index 601d12f17..000000000 --- a/utils/download_hf_file.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -from huggingface_hub import hf_hub_download -import os -import shutil - - -def parse_args(): - parser = argparse.ArgumentParser(description="Download huggingface file") - parser.add_argument("--repo_id", type=str, help="the repo id") - parser.add_argument("--filename", default=None, type=str, help="the file name for the download file") - parser.add_argument("--target_path", default="./", type=str, help="the target path for the download file") - parser.add_argument("--repo_type", default="dataset", type=str, help="the repo type") - args = parser.parse_args() - return args - - - -if __name__ == "__main__": - args = parse_args() - print("Start downloading repo {} filename: {}".format( - args.repo_id, args.filename)) - args.target_path = os.path.abspath(args.target_path) - source_path = hf_hub_download(repo_id=args.repo_id, filename=args.filename, repo_type=args.repo_type) - os.makedirs(args.target_path, exist_ok=True) - target_path = os.path.join(args.target_path, args.filename) - shutil.copyfile(source_path, target_path) - print("Finish downloading repo {} filename: {}".format( - args.repo_id, args.filename)) diff --git a/utils/lm_evaluator.py b/utils/lm_evaluator.py deleted file mode 100644 index f84b8b85e..000000000 --- a/utils/lm_evaluator.py +++ /dev/null @@ -1,108 +0,0 @@ -import argparse -import json -import logging -import fnmatch - -from lm_eval import tasks, evaluator - -logging.getLogger("openai").setLevel(logging.WARNING) - - -class MultiChoice: - def __init__(self, choices): - self.choices = choices - - # Simple wildcard support (linux filename patterns) - def __contains__(self, values): - for value in values.split(","): - if len(fnmatch.filter(self.choices, value)) == 0: - return False - - return True - - def __iter__(self): - for choice in self.choices: - yield choice - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model", required=True) - parser.add_argument("--model_args", default="") - parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS)) - parser.add_argument("--provide_description", action="store_true") - parser.add_argument("--num_fewshot", type=int, default=0) - parser.add_argument("--batch_size", type=int, default=None) - parser.add_argument("--device", type=str, default=None) - parser.add_argument("--output_path", default=None) - parser.add_argument("--limit", type=int, default=None) - parser.add_argument("--no_cache", action="store_true") - parser.add_argument("--decontamination_ngrams_path", default=None) - parser.add_argument("--description_dict_path", default=None) - 
parser.add_argument("--check_integrity", action="store_true") - - return parser.parse_args() - - -# Returns a list containing all values of the source_list that -# match at least one of the patterns -def pattern_match(patterns, source_list): - task_names = set() - for pattern in patterns: - for matching in fnmatch.filter(source_list, pattern): - task_names.add(matching) - return list(task_names) - - -def main(): - args = parse_args() - - assert not args.provide_description # not implemented - - if args.limit: - print( - "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." - ) - - if args.tasks is None: - task_names = tasks.ALL_TASKS - else: - task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS) - - print(f"Selected Tasks: {task_names}") - - description_dict = {} - if args.description_dict_path: - with open(args.description_dict_path, "r") as f: - description_dict = json.load(f) - - results = evaluator.simple_evaluate( - model=args.model, - model_args=args.model_args, - tasks=task_names, - num_fewshot=args.num_fewshot, - batch_size=args.batch_size, - device=args.device, - no_cache=args.no_cache, - limit=args.limit, - description_dict=description_dict, - decontamination_ngrams_path=args.decontamination_ngrams_path, - check_integrity=args.check_integrity, - ) - - dumped = json.dumps(results, indent=2) - print(dumped) - - if args.output_path: - with open(args.output_path, "w") as f: - f.write(dumped) - - print( - f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " - f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}" - ) - print(evaluator.make_table(results)) - - -if __name__ == "__main__": - main() diff --git a/utils/make_delta.py b/utils/make_delta.py deleted file mode 100644 index c495ea61b..000000000 --- a/utils/make_delta.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Make the delta weights by subtracting base weights. 
- -Usage: -python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 -""" -import argparse - -import torch -from tqdm import tqdm -from transformers import AutoTokenizer, AutoModelForCausalLM - - -def make_delta(base_model_path, target_model_path, delta_path): - print(f"Loading the base model from {base_model_path}") - base = AutoModelForCausalLM.from_pretrained( - base_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True - ) - - print(f"Loading the target model from {target_model_path}") - target = AutoModelForCausalLM.from_pretrained( - target_model_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True - ) - target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) - - print("Calculating the delta") - for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): - assert name in base.state_dict() - param.data -= base.state_dict()[name] - - print(f"Saving the delta to {delta_path}") - if args.hub_repo_id: - kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} - else: - kwargs = {} - target.save_pretrained(delta_path, **kwargs) - target_tokenizer.save_pretrained(delta_path, **kwargs) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--base-model-path", type=str, required=True) - parser.add_argument("--target-model-path", type=str, required=True) - parser.add_argument("--delta-path", type=str, required=True) - parser.add_argument("--hub-repo-id", type=str) - args = parser.parse_args() - - make_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/utils/merge_tokenizer.py b/utils/merge_tokenizer.py deleted file mode 100644 index 9931edcf6..000000000 --- a/utils/merge_tokenizer.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -import argparse -import logging -import os - -from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model -import sentencepiece as spm - -import torch -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -from transformers import AutoTokenizer,LlamaTokenizer - -logging.basicConfig(level=logging.INFO) - -if __name__ == '__main__': - - os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"]="python" - - parser = argparse.ArgumentParser() - parser.add_argument('--tokenizer_dir', default='openlm-research/open_llama_3b', type=str, required=False) - parser.add_argument('--chinese_sp_model_file', default='./output_models/new_tokenizer/example.model', type=str) - parser.add_argument('--output_dir', default='./output_models/merged_tokenizer', type=str, required=False) - args = parser.parse_args() - - tokenizer_dir = args.tokenizer_dir - chinese_sp_model_file = args.chinese_sp_model_file - output_dir = args.output_dir - - # load - try: - old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, use_fast=False) - except RecursionError: - old_tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, - unk_token="", - bos_token="", - eos_token="", - use_fast=False) - - if not isinstance(old_tokenizer,LlamaTokenizer): - raise ValueError("The tokenizer is not a LlamaTokenizer, we only support LlamaTokenizer for now.") - - chinese_sp_model = spm.SentencePieceProcessor() - chinese_sp_model.Load(chinese_sp_model_file) - - old_spm = sp_pb2_model.ModelProto() - old_spm.ParseFromString(old_tokenizer.sp_model.serialized_model_proto()) - chinese_spm = sp_pb2_model.ModelProto() - 
chinese_spm.ParseFromString(chinese_sp_model.serialized_model_proto()) - - ## Add Chinese tokens to old tokenizer - old_spm_tokens_set=set(p.piece for p in old_spm.pieces) - for p in chinese_spm.pieces: - piece = p.piece - if piece not in old_spm_tokens_set: - new_p = sp_pb2_model.ModelProto().SentencePiece() - new_p.piece = piece - new_p.score = 0 - old_spm.pieces.append(new_p) - - ## Save - output_sp_dir = output_dir + '/merged_tokenizer_sp' - output_hf_dir = output_dir + '/merged_tokenizer_hf' # the path to save tokenizer - os.makedirs(output_sp_dir,exist_ok=True) - with open(output_sp_dir+'/merged_tokenizer.model', 'wb') as f: - f.write(old_spm.SerializeToString()) - - try: - tokenizer = AutoTokenizer.from_pretrained( - pretrained_model_name_or_path=tokenizer_dir, - vocab_file=output_sp_dir+'/merged_tokenizer.model', - use_fast=False - ) - except RecursionError: - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=tokenizer_dir, - unk_token="", - bos_token="", - eos_token="", - vocab_file=output_sp_dir+'/merged_tokenizer.model', - use_fast=False) - - tokenizer.save_pretrained(output_hf_dir) - logging.info(f"Merged tokenizer has been saved to %s",output_dir) - - - # Test - new_tokenizer = tokenizer - logging.info(f"Old tokenizer vocab size: %d",len(old_tokenizer)) - logging.info(f"New tokenizer vocab size: %d",len(new_tokenizer)) - - text='''白日依山尽,黄河入海流。欲穷千里目,更上一层楼。 - The primary use of LLaMA is research on large language models, including''' - logging.info(f"Test text:\n %s",text) - logging.info(f"Tokenized by original tokenizer:%s",old_tokenizer.tokenize(text)) - logging.info(f"Tokenized by merged tokenizer:%s",new_tokenizer.tokenize(text)) \ No newline at end of file diff --git a/utils/preprocess_multimodal_data.py b/utils/preprocess_multimodal_data.py deleted file mode 100644 index 10fc060a2..000000000 --- a/utils/preprocess_multimodal_data.py +++ /dev/null @@ -1,25 +0,0 @@ -import argparse -import os.path as osp -import torch -import json - -def parse_args(): - parser = argparse.ArgumentParser(description="Convert checkpoint from MiniGPT4") - parser.add_argument("--data_path", type=str, help="the model path for the to convert checkpoint") - parser.add_argument("--save_path", default=None, type=str, help="the save path for converted checkpoint") - parser.add_argument("--max_length", default=1000, type=int, help="the max length for the text file") - args = parser.parse_args() - return args - - -if __name__ == "__main__": - args = parse_args() - data = json.load(open(args.data_path)) - for data_idx in data: - for item in data_idx['conversations']: - if len(item["value"]) > args.max_length: - item["value"] = item["value"][:args.max_length] - with open(args.save_path, 'w') as f: - json.dump(data, f) - print("finish processing the data.") - \ No newline at end of file diff --git a/utils/train_tokenizer.py b/utils/train_tokenizer.py deleted file mode 100644 index 7b67f6c40..000000000 --- a/utils/train_tokenizer.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# coding=utf-8 - -import argparse -import os -import sentencepiece as spm - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--dataset_path', default='./data/wiki_zh_eval/converted_data.txt', type=str, required=False) - parser.add_argument('--output_dir', default='./output_models/new_tokenizer', type=str, required=False) - parser.add_argument('--vocab_size', default=20000, type=int, required=False) - parser.add_argument('--model_type', default='bpe', type=str, 
required=False) - parser.add_argument('--user_defined_symbols', default='0,1,2,3,4,5,6,7,8,9,%', type=str, required=False) - parser.add_argument('--max_sentencepiece_length', default=4, type=int, required=False) - args = parser.parse_args() - - dataset_path = args.dataset_path - output_dir = args.output_dir - vocab_size = args.vocab_size - model_type = args.model_type - user_defined_symbols = args.user_defined_symbols - max_sentencepiece_length=args.max_sentencepiece_length - - def mkdir(path): - if not os.path.exists(path): - os.makedirs(path) - mkdir(output_dir) - - spm.SentencePieceTrainer.train( - f'--input={dataset_path}' - f' --model_prefix={output_dir}/example' - f' --model_type={model_type}' - f' --vocab_size={vocab_size}' - f' --user_defined_symbols={user_defined_symbols}' - f' --max_sentencepiece_length={max_sentencepiece_length}' - f' --minloglevel=1' - ) \ No newline at end of file
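Note: the deleted utils/train_tokenizer.py above was a thin command-line wrapper around SentencePiece training. For readers who relied on it, a minimal standalone sketch of the same workflow is given below; it simply reuses the removed script's default paths and hyperparameters (the corpus at ./data/wiki_zh_eval/converted_data.txt was produced by the likewise-deleted utils/convert_json_to_txt.py), so treat the paths as placeholders for your own data.

    import os

    import sentencepiece as spm

    # Defaults taken from the deleted utils/train_tokenizer.py.
    dataset_path = "./data/wiki_zh_eval/converted_data.txt"  # plain-text corpus, one passage per line
    output_dir = "./output_models/new_tokenizer"
    os.makedirs(output_dir, exist_ok=True)

    # Train a BPE SentencePiece model with the same settings the removed script used.
    spm.SentencePieceTrainer.train(
        input=dataset_path,
        model_prefix=os.path.join(output_dir, "example"),
        model_type="bpe",
        vocab_size=20000,
        user_defined_symbols=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "%"],
        max_sentencepiece_length=4,
        minloglevel=1,
    )

    # Sanity check: load the trained model and tokenize a sample sentence.
    sp = spm.SentencePieceProcessor()
    sp.Load(os.path.join(output_dir, "example.model"))
    print(sp.EncodeAsPieces("白日依山尽,黄河入海流。"))

The resulting example.model can then be merged into an existing LLaMA tokenizer with the (also deleted) utils/merge_tokenizer.py workflow shown earlier in this patch.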