From 7943c6a159eddeb0a0b35fbd71d279f45b510357 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 6 Mar 2025 18:01:36 +0800
Subject: [PATCH 1/3] [fix] update load lora model Readme;

---
 applications/ColossalChat/examples/README.md | 70 ++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index af10dea256ac..58ce7fb93640 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -892,6 +892,76 @@ The dialogues can by multiple turns and it can contain system prompt. For more d
 We use bf16 weights for finetuning. If you downloaded fp8 DeepSeek V3/R1 weights, you can use the [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) to convert the weights to bf16 via GPU. For Ascend NPU, you can use this [script](https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/fp8_cast_bf16.py).
 
+We also add details on how to load LoRA models using the booster.
+```python
+import os
+import torch
+from peft import LoraConfig
+from torch import distributed as dist
+from torch.optim import AdamW
+import colossalai
+from colossalai.booster import Booster
+from colossalai.booster.plugin import HybridParallelPlugin
+from transformers import AutoModelForCausalLM
+from tests.test_checkpoint_io.utils import shared_tempdir
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+
+@clear_cache_before_run()
+def run_load_lora():
+    model_name = "Qwen/Qwen2.5-3B"
+
+    # 1. Load the base model
+    base_model = AutoModelForCausalLM.from_pretrained(
+        model_name,
+        device_map="auto",
+        trust_remote_code=True
+    )
+
+    # 2. Set the LoRA config
+    peft_config = LoraConfig(
+        r=2,
+        lora_alpha=16,
+        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "c_attn", "c_proj", "w1", "w2"],
+        lora_dropout=0.05,
+        bias="none",
+        task_type="CAUSAL_LM",
+        modules_to_save=["lm_head"]  # keep the language model head trainable
+    )
+    test_config = {
+        "lora_config": peft_config,
+        "quantize": False,
+    }
+
+    # 3. Init the optimizer, criterion, and plugin
+    optimizer = AdamW(base_model.parameters(), lr=0.001)
+
+    def loss_fn(x):
+        return (x * x).mean()
+
+    criterion = loss_fn
+    plugin = HybridParallelPlugin(tp_size=1, pp_size=1)
+    booster = Booster(plugin=plugin)
+
+    # 4. Wrap the LoRA model via the booster
+    peft_model = booster.enable_lora(base_model, **test_config)
+    model_save, optimizer, criterion, _, _ = booster.boost(peft_model, optimizer, criterion)
+
+    # 5. Save the LoRA model
+    with shared_tempdir() as tempdir:
+        lora_ckpt_path = os.path.join(tempdir, "ckpt")
+        booster.save_lora_as_pretrained(model_save, lora_ckpt_path)
+        dist.barrier()
+
+def run_dist(rank, world_size, port):
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_load_lora()
+
+
+@rerun_if_address_is_in_use()
+def test_torch_ddp_lora():
+    spawn(run_dist, 2)
+```
+
 #### Usage
 
 After preparing the dataset and model weights, you can run the script with the following command:

From c60f3eb36949fd20a041dfe7134b865c6bac086f Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Thu, 6 Mar 2025 18:35:48 +0800
Subject: [PATCH 2/3] [fix] update lora infer readme

---
 applications/ColossalChat/examples/README.md | 163 +++++++++++--------
 1 file changed, 98 insertions(+), 65 deletions(-)

diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index 58ce7fb93640..26f43ff8e81b 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -892,74 +892,107 @@ The dialogues can by multiple turns and it can contain system prompt. For more d
 We use bf16 weights for finetuning. If you downloaded fp8 DeepSeek V3/R1 weights, you can use the [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) to convert the weights to bf16 via GPU. For Ascend NPU, you can use this [script](https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/fp8_cast_bf16.py).
 
-We also add details on how to load LoRA models using the booster.
+We have also added details on how to save and load LoRA models and run inference with them.
 ```python
-import os
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig
+)
+from peft import (
+    LoraConfig,
+    get_peft_model,
+    prepare_model_for_kbit_training,
+    PeftModel
+)
 import torch
-from peft import LoraConfig
-from torch import distributed as dist
-from torch.optim import AdamW
-import colossalai
-from colossalai.booster import Booster
-from colossalai.booster.plugin import HybridParallelPlugin
-from transformers import AutoModelForCausalLM
-from tests.test_checkpoint_io.utils import shared_tempdir
-from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
-
-@clear_cache_before_run()
-def run_load_lora():
-    model_name = "Qwen/Qwen2.5-3B"
-
-    # 1. Load the base model
-    base_model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        device_map="auto",
-        trust_remote_code=True
-    )
-
-    # 2. Set the LoRA config
-    peft_config = LoraConfig(
-        r=2,
-        lora_alpha=16,
-        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "c_attn", "c_proj", "w1", "w2"],
-        lora_dropout=0.05,
-        bias="none",
-        task_type="CAUSAL_LM",
-        modules_to_save=["lm_head"]  # keep the language model head trainable
-    )
-    test_config = {
-        "lora_config": peft_config,
-        "quantize": False,
-    }
-
-    # 3. Init the optimizer, criterion, and plugin
-    optimizer = AdamW(base_model.parameters(), lr=0.001)
-
-    def loss_fn(x):
-        return (x * x).mean()
-
-    criterion = loss_fn
-    plugin = HybridParallelPlugin(tp_size=1, pp_size=1)
-    booster = Booster(plugin=plugin)
-
-    # 4. Wrap the LoRA model via the booster
-    peft_model = booster.enable_lora(base_model, **test_config)
-    model_save, optimizer, criterion, _, _ = booster.boost(peft_model, optimizer, criterion)
-
-    # 5. Save the LoRA model
-    with shared_tempdir() as tempdir:
-        lora_ckpt_path = os.path.join(tempdir, "ckpt")
-        booster.save_lora_as_pretrained(model_save, lora_ckpt_path)
-        dist.barrier()
-
-def run_dist(rank, world_size, port):
-    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
-    run_load_lora()
-
-
-@rerun_if_address_is_in_use()
-def test_torch_ddp_lora():
-    spawn(run_dist, 2)
+######
+# How to Create and Save a LoRA Model
+######
+# 1. Set the model paths
+model_name = "Qwen/Qwen2.5-3B"
+lora_adapter = "./Qwen2.5-3B_lora"
+merged_model_path = "./Qwen2.5-3B_merged"
+
+# 2. Set the quantization config (optional)
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+# 3. Load the base model
+base_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="auto",
+    trust_remote_code=True
+)
+
+# 4. Prepare the quantized model for training
+base_model = prepare_model_for_kbit_training(
+    base_model,
+    use_gradient_checkpointing=True
+)
+
+# 5. Set the LoRA config
+peft_config = LoraConfig(
+    r=2,
+    lora_alpha=16,
+    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "c_attn", "c_proj", "w1", "w2"],
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+    modules_to_save=["lm_head"]
+)
+
+# 6. Create the LoRA model from the base model
+peft_model = get_peft_model(base_model, peft_config)
+peft_model.print_trainable_parameters()
+
+# 7. Init the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    pad_token="<|endoftext|>"
+)
+
+# 8. Save the LoRA adapter
+peft_model.save_pretrained(lora_adapter)
+
+######
+# How to Load a LoRA Model
+######
+# 9. Load the base model and the LoRA adapter
+base_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True
+)
+
+peft_model = PeftModel.from_pretrained(
+    base_model,
+    lora_adapter,
+    torch_dtype=torch.bfloat16
+)
+
+# 10. Merge the LoRA weights into the base model
+merged_model = peft_model.merge_and_unload()
+
+# 11. Save the merged model
+merged_model.save_pretrained(
+    merged_model_path,
+    safe_serialization=True
+)
+tokenizer.save_pretrained(merged_model_path)
+
+# 12. Test the merged model output
+test_input = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to("cuda")
+output = merged_model.generate(**test_input, max_new_tokens=100)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 
 #### Usage

From e692586cd5a45a43b75f88a6127fe36151bec885 Mon Sep 17 00:00:00 2001
From: duanjunwen <935724073@qq.com>
Date: Fri, 7 Mar 2025 11:40:55 +0800
Subject: [PATCH 3/3] [fix] remove useless comments

---
 applications/ColossalChat/examples/README.md | 78 ++++----------------
 1 file changed, 16 insertions(+), 62 deletions(-)

diff --git a/applications/ColossalChat/examples/README.md b/applications/ColossalChat/examples/README.md
index 26f43ff8e81b..f111660a87e2 100755
--- a/applications/ColossalChat/examples/README.md
+++ b/applications/ColossalChat/examples/README.md
@@ -892,80 +892,26 @@ The dialogues can by multiple turns and it can contain system prompt. For more d
 We use bf16 weights for finetuning. If you downloaded fp8 DeepSeek V3/R1 weights, you can use the [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) to convert the weights to bf16 via GPU. For Ascend NPU, you can use this [script](https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/LLM/DeepSeek/DeepSeek-V2/NPU_inference/fp8_cast_bf16.py).
 
-We have also added details on how to save and load LoRA models and run inference with them.
+We have also added details on how to load and run inference with LoRA models.
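+
+Merging is optional if you only want to spot-check an adapter: a `PeftModel` can run `generate` directly, applying the LoRA weights on the fly at each forward pass. A minimal sketch, assuming the same `model_name` and `lora_adapter` paths as in the walkthrough below:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+
+# Attach the adapter on top of the base model; no merge step is needed
+# for generation, at the cost of a small adapter overhead per forward.
+base_model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen2.5-3B", torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+)
+model = PeftModel.from_pretrained(base_model, "Qwen2.5-3B_lora")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B", trust_remote_code=True)
+
+inputs = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+The full load-merge-save walkthrough: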
 ```python
 from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
-    BitsAndBytesConfig
 )
 from peft import (
-    LoraConfig,
-    get_peft_model,
-    prepare_model_for_kbit_training,
     PeftModel
 )
 import torch
 
-######
-# How to Create and Save a LoRA Model
-######
-# 1. Set the model paths
+# Set the model paths
 model_name = "Qwen/Qwen2.5-3B"
-lora_adapter = "./Qwen2.5-3B_lora"
-merged_model_path = "./Qwen2.5-3B_merged"
-
-# 2. Set the quantization config (optional)
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.bfloat16
-)
-
-# 3. Load the base model
-base_model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    quantization_config=bnb_config,
-    device_map="auto",
-    trust_remote_code=True
-)
-
-# 4. Prepare the quantized model for training
-base_model = prepare_model_for_kbit_training(
-    base_model,
-    use_gradient_checkpointing=True
-)
-
-# 5. Set the LoRA config
-peft_config = LoraConfig(
-    r=2,
-    lora_alpha=16,
-    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "c_attn", "c_proj", "w1", "w2"],
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-    modules_to_save=["lm_head"]
-)
-
-# 6. Create the LoRA model from the base model
-peft_model = get_peft_model(base_model, peft_config)
-peft_model.print_trainable_parameters()
-
-# 7. Init the tokenizer
-tokenizer = AutoTokenizer.from_pretrained(
-    model_name,
-    trust_remote_code=True,
-    pad_token="<|endoftext|>"
-)
-
-# 8. Save the LoRA adapter
-peft_model.save_pretrained(lora_adapter)
+lora_adapter = "Qwen2.5-3B_lora"  # path to your LoRA adapter
+merged_model_path = "Qwen2.5-3B_merged"
 
 ######
 # How to Load a LoRA Model
 ######
-# 9. Load the base model and the LoRA adapter
+# 1. Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
     device_map="auto",
@@ -973,23 +919,31 @@ base_model = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 )
 
+# 2. Load the LoRA adapter
 peft_model = PeftModel.from_pretrained(
     base_model,
     lora_adapter,
     torch_dtype=torch.bfloat16
 )
 
-# 10. Merge the LoRA weights into the base model
+# 3. Merge the LoRA weights into the base model
 merged_model = peft_model.merge_and_unload()
 
-# 11. Save the merged model
+# 4. Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name,
+    trust_remote_code=True,
+    pad_token="<|endoftext|>"
+)
+
+# 5. Save the merged model
 merged_model.save_pretrained(
     merged_model_path,
     safe_serialization=True
 )
 tokenizer.save_pretrained(merged_model_path)
 
-# 12. Test the merged model output
+# 6. Run inference
 test_input = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to("cuda")
 output = merged_model.generate(**test_input, max_new_tokens=100)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
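+
+Because `merge_and_unload()` folds the adapter weights into the base model, the saved checkpoint behaves like a plain Hugging Face model and `peft` is no longer needed at inference time. A minimal sketch, assuming the `merged_model_path` used above:
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Load the merged checkpoint directly; no PeftModel wrapper is required.
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen2.5-3B_merged", torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
+)
+tokenizer = AutoTokenizer.from_pretrained("Qwen2.5-3B_merged", trust_remote_code=True)
+
+inputs = tokenizer("Instruction: Finding prime numbers up to 100\nAnswer:", return_tensors="pt").to(model.device)
+output = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```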