From 2f4edd279cf186670a730453549cd5e0592c83cf Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:04:08 +0800 Subject: [PATCH 1/3] support q3.6-27b --- README.md | 7 +- README_ZH.md | 7 +- .../client/server/megatron/server_config.yaml | 26 ++-- cookbook/client/tinker/modelscope/dpo.py | 2 +- cookbook/client/tinker/modelscope/sample.py | 2 +- .../tinker/modelscope/self_cognition.py | 2 +- .../tinker/modelscope/short_math_grpo.py | 2 +- cookbook/client/twinkle/modelscope/dpo.py | 2 +- .../client/twinkle/modelscope/multi_modal.py | 2 +- .../twinkle/modelscope/self_congnition.py | 2 +- cookbook/rl/short_math_grpo_moe.py | 2 +- cookbook/rl/short_math_grpo_multi_lora.py | 4 +- .../Usage Guide/Train-as-a-Service.md | 8 +- ...55\347\273\203\346\234\215\345\212\241.md" | 8 +- notebook/dpo.ipynb | 90 +++++++----- notebook/multi_modal.ipynb | 88 +++++++----- notebook/sample.ipynb | 65 ++++++--- notebook/self_cognition.ipynb | 88 +++++++----- notebook/short_math_grpo.ipynb | 133 ++++++++---------- src/twinkle/server/gateway/server.py | 2 +- 20 files changed, 294 insertions(+), 248 deletions(-) diff --git a/README.md b/README.md index e6779d46..7c01765e 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,7 @@ sh INSTALL_MEGATRON.sh | Server startup scripts | transformers/megatron | [Script](cookbook/client/server) | ## Changelog +- 🎉2026-04-22 The ModelScope service has been deployed to [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B) with a new release 0.2.1. - 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0. - 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py). - 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/ @@ -143,7 +144,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). +> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -192,7 +193,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3.6-35B-A3B' + base_model = 'ms://Qwen/Qwen3.6-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -248,7 +249,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/README_ZH.md b/README_ZH.md index 64ce16a6..3496abb8 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -105,6 +105,7 @@ sh INSTALL_MEGATRON.sh Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的,非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md) ## 更新日志 +🎉2026-04-22 ModelScope的训练服务部署为[Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B),并发布了0.2.1版本. 🎉2026-04-16 ModelScope的训练服务部署为[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B),并发布了0.2.0版本. 🎉2026-03-28 支持 DPO 训练,同时支持 Transformers 和 Megatron 后端。参考 [dpo_full.py](cookbook/rl/dpo_full.py) 和 [dpo_lora.py](cookbook/rl/dpo_lora.py)。 🎉2026-03-24 Twinkle 站点上线,访问地址 https://modelscope.github.io/twinkle-web/ @@ -129,7 +130,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -177,7 +178,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' - base_model = 'ms://Qwen/Qwen3.6-35B-A3B' + base_model = 'ms://Qwen/Qwen3.6-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -233,7 +234,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index d95bfa4e..69620020 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -39,15 +39,15 @@ applications: # Used for generating text from the model (e.g., evaluating LoRA results). 
# Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA
 - name: sampler-Qwen3.6-35B-A3B
- route_prefix: /api/v1/sampler/Qwen/Qwen3.6-35B-A3B
+ route_prefix: /api/v1/sampler/Qwen/Qwen3.6-27B
 import_path: sampler
 args:
- model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier
+ model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
 nproc_per_node: 4 # Number of GPU processes per node
 sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
 engine_args: # vLLM engine-specific settings
- max_model_len: 32000 # Maximum sequence length the engine supports
- gpu_memory_utilization: 0.80 # 80% utilization, ~64GB/GPU, leaves buffer for safety
+ max_model_len: 65536 # Maximum sequence length the engine supports
+ gpu_memory_utilization: 0.75 # 75% utilization, ~60GB/GPU, leaves buffer for safety
 enable_lora: true # Allow loading LoRA adapters during inference
 max_loras: 5 # Max allowed loras working on vLLM at the same time
 max_lora_rank: 32 # Support up to rank 64 LoRA adapters
@@ -63,8 +63,8 @@
 tp_size: 2 # 2 TP replicas for multi-tenant throughput
 queue_config:
 rps_limit: 20 # Max requests per second
- tps_limit: 32000 # Max tokens per second
- max_input_tokens: 32000
+ tps_limit: 131072 # Max tokens per second
+ max_input_tokens: 65536
 deployments:
 - name: SamplerManagement
 autoscaling_config:
@@ -81,12 +81,12 @@
 # 2. Model Service - Hosts the base model for training.
 # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
 - name: models-Qwen3.6-35B-A3B
- route_prefix: /api/v1/model/Qwen/Qwen3.6-35B-A3B
+ route_prefix: /api/v1/model/Qwen/Qwen3.6-27B
 import_path: model
 args:
 use_megatron: true # Use Megatron-LM backend
- model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier
- max_length: 32000 # model max length
+ model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
+ max_length: 65536 # model max length
 max_loras: 3 # model max loras
 nproc_per_node: 4 # Number of GPU processes per node
 device_group:
@@ -95,15 +95,13 @@
 device_type: cuda
 device_mesh:
 device_type: cuda
- tp_size: 2
- ep_size: 2
+ dp_size: 2
 pp_size: 2
- sequence_parallel: True
 queue_config:
 rps_limit: 20 # Max requests per second
- tps_limit: 32000 # Max tokens per second
- max_input_tokens: 32000
+ tps_limit: 131072 # Max tokens per second
+ max_input_tokens: 65536
 adapter_config:
 adapter_timeout: 120 # Seconds before idle adapter unload
 adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
diff --git a/cookbook/client/tinker/modelscope/dpo.py b/cookbook/client/tinker/modelscope/dpo.py
index a88b70a7..23cf5aae 100644
--- a/cookbook/client/tinker/modelscope/dpo.py
+++ b/cookbook/client/tinker/modelscope/dpo.py
@@ -39,7 +39,7 @@
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
-base_model = 'Qwen/Qwen3.6-35B-A3B'
+base_model = 'Qwen/Qwen3.6-27B'
 base_url = 'http://www.modelscope.cn/twinkle'
 api_key = os.environ.get('MODELSCOPE_TOKEN')
 dataset_id = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'
diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py
index d12f3bd6..7e03e0bd 100644
--- a/cookbook/client/tinker/modelscope/sample.py
+++ b/cookbook/client/tinker/modelscope/sample.py
@@ -16,7 +16,7 @@
 from tinker import ServiceClient
-base_model = 'Qwen/Qwen3.6-35B-A3B'
+base_model
= 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 7780df60..f74ec073 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -23,7 +23,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index cf210c46..bf57a942 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== -BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B' +BASE_MODEL = 'Qwen/Qwen3.6-27B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 2e-5 diff --git a/cookbook/client/twinkle/modelscope/dpo.py b/cookbook/client/twinkle/modelscope/dpo.py index 17a69965..e9451e31 100644 --- a/cookbook/client/twinkle/modelscope/dpo.py +++ b/cookbook/client/twinkle/modelscope/dpo.py @@ -24,7 +24,7 @@ logger = get_logger() # Configuration (direct values, not from env) -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' dataset_id = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji' diff --git a/cookbook/client/twinkle/modelscope/multi_modal.py b/cookbook/client/twinkle/modelscope/multi_modal.py index 331352c8..106d85d4 100644 --- a/cookbook/client/twinkle/modelscope/multi_modal.py +++ b/cookbook/client/twinkle/modelscope/multi_modal.py @@ -24,7 +24,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index 2248ddd4..5acd8a9a 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -21,7 +21,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/cookbook/rl/short_math_grpo_moe.py b/cookbook/rl/short_math_grpo_moe.py index 9d870eac..6ad5cc2f 100644 --- a/cookbook/rl/short_math_grpo_moe.py +++ b/cookbook/rl/short_math_grpo_moe.py @@ -28,7 +28,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) diff --git a/cookbook/rl/short_math_grpo_multi_lora.py b/cookbook/rl/short_math_grpo_multi_lora.py index 9dad8df3..96d7ef9b 100644 --- a/cookbook/rl/short_math_grpo_multi_lora.py +++ b/cookbook/rl/short_math_grpo_multi_lora.py @@ -5,7 +5,7 @@ weights to a local directory, then passes the path to vLLMSampler via `adapter_path` so vLLM loads the latest adapter from disk. 
-Model: Qwen/Qwen3.6-35B-A3B (MoE, 35B total / 3B active) +Model: Qwen/Qwen3.6-27B (MoE, 35B total / 3B active) Model mesh: tp=2, ep=2, pp=2, sequence_parallel=True (8 GPUs) Sampler mesh: dp=2, tp=2, gpus_per_worker=2 (4 GPUs) @@ -35,7 +35,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2)) diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index b3a3d736..57ba26fb 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -2,7 +2,7 @@ Alongside the open-source release of the Twinkle framework, we also provide a hosted model training service (Training as a Service) powered by ModelScope's backend infrastructure. Developers can use this service to experience Twinkle's training API for free. -The model currently running on the cluster is [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). Below are the detailed usage instructions: +The model currently running on the cluster is [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B). Below are the detailed usage instructions: ## Step 1. Register a ModelScope Account and Obtain Your API Key @@ -30,7 +30,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -64,7 +64,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.6-35B-A3B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: +With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.6-27B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: ```python import os @@ -79,7 +79,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 001ad5e7..c5db28d3 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -3,7 +3,7 @@ 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 -目前在集群中运行的模型是[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。下面介绍具体的使用方法: +目前在集群中运行的模型是[Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B)。下面介绍具体的使用方法: ## Step 1. 
注册ModelScope用户并获取 API Key @@ -31,7 +31,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -65,7 +65,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.6-35B-A3B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: +通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.6-27B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: ```python import os @@ -80,7 +80,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git a/notebook/dpo.ipynb b/notebook/dpo.ipynb index c1dc132d..caeb6582 100644 --- a/notebook/dpo.ipynb +++ b/notebook/dpo.ipynb @@ -95,7 +95,7 @@ "\n", "| 配置项 | 默认值 | 说明 |\n", "|--------|--------|------|\n", - "| `BASE_MODEL` | Qwen/Qwen3.6-35B-A3B | 基座模型 |\n", + "| `BASE_MODEL` | Qwen/Qwen3.6-27B | 基座模型 |\n", "| `BATCH_SIZE` | 4 | 每步处理的 DPO 样本对数 |\n", "| `LEARNING_RATE` | 1e-4 | 学习率 |\n", "| `DPO_BETA` | 0.1 | DPO 温度系数,控制偏好强度 |\n", @@ -149,7 +149,7 @@ "logger = get_logger()\n", "\n", "# ========== 全局配置 ==========\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "BASE_URL = 'http://www.modelscope.cn/twinkle'\n", "API_KEY = getpass(\"ModelScope Token: \")\n", "DATASET_ID = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'\n", @@ -464,13 +464,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n", - "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传值 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -488,7 +484,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: save_result.path\n", @@ -535,44 +531,60 @@ }, { "cell_type": "markdown", - "id": "13d43fcc", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", "\n", - "> **注意**:合并操作需要 
GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/multi_modal.ipynb b/notebook/multi_modal.ipynb index 41d32478..4c88d585 100644 --- a/notebook/multi_modal.ipynb +++ b/notebook/multi_modal.ipynb @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "base_model = 'Qwen/Qwen3.6-35B-A3B'\n", + "base_model = 'Qwen/Qwen3.6-27B'\n", "base_url = 'http://www.modelscope.cn/twinkle'\n", "\n", "client = init_twinkle_client(base_url=base_url, api_key=api_key)\n", @@ -393,13 +393,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n", - "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传值 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -417,7 +413,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>'\n", @@ -463,44 +459,60 @@ }, { "cell_type": "markdown", - "id": "792c879b", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', 
device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/sample.ipynb b/notebook/sample.ipynb index c7cfedfe..34bd2d97 100644 --- a/notebook/sample.ipynb +++ b/notebook/sample.ipynb @@ -108,7 +108,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为你的训练检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: 'twinkle://xxx/weights/twinkle-lora-2'\n", @@ -244,31 +244,60 @@ }, { "cell_type": "markdown", - "id": "6643d9ae", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", + "\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + 
"outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/self_cognition.ipynb b/notebook/self_cognition.ipynb index b5cc300c..5b1d91a7 100644 --- a/notebook/self_cognition.ipynb +++ b/notebook/self_cognition.ipynb @@ -153,7 +153,7 @@ "\n", "from tinker import ServiceClient\n", "\n", - "base_model = 'Qwen/Qwen3.6-35B-A3B'\n", + "base_model = 'Qwen/Qwen3.6-27B'\n", "base_url = 'http://www.modelscope.cn/twinkle'\n", "api_key = getpass(\"ModelScope Token: \")" ] @@ -297,13 +297,9 @@ "\n", "加载训练好的 LoRA 检查点,向模型提问「你是谁?」,观察模型是否以自定义身份回答。\n", "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传至 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。\n", - "\n", "### 2.1 加载检查点并创建采样客户端\n", "\n", - "> 将下方 `weight_path` 替换为 Part 1 训练输出的检查点路径。" + "> 将下方 `weight_path` 替换为 Part 1 训练输出的检查点路径。\n" ] }, { @@ -314,11 +310,11 @@ "outputs": [], "source": [ "# TODO: 替换为 Part 1 输出的检查点路径\n", - "weight_path = 'twinkle://20260420_142308-Qwen_Qwen3_6-35B-A3B-38777028/weights/twinkle-lora-0'\n", + "weight_path = '<替换为 Part 1 输出的检查点路径>'\n", "\n", "service_client = ServiceClient(base_url=base_url, api_key=api_key)\n", "sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)\n", - "print('采样客户端创建成功')" + "print('采样客户端创建成功')\n" ] }, { @@ -416,40 +412,56 @@ "id": "66f91299", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "id": "peft_merge_code", + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载基座模型\n", + "base_model_hf = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 加载 LoRA 适配器并合并\n", + "peft_model = PeftModel.from_pretrained(base_model_hf, lora_path)\n", + "merged_model = peft_model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "id": "peft_merge_note", + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" ] }, { @@ -470,7 +482,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "vllm19", "language": "python", "name": "python3" }, @@ -484,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.13" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/notebook/short_math_grpo.ipynb b/notebook/short_math_grpo.ipynb index 8e8b1b31..f7344bcf 100644 --- a/notebook/short_math_grpo.ipynb +++ b/notebook/short_math_grpo.ipynb @@ -86,46 +86,18 @@ "## 第一步:导入依赖与全局配置\n", "\n", "> **为什么使用 Twinkle 客户端语法?**\n", - "> Twinkle 提供 `tinker` 和 `twinkle` 两套客户端 API。其中 **tinker** 接口不支持设置 `target_modules`、`LoraConfig` 等细节调控,而 GRPO 训练在 MoE 模型上需要显式指定 LoRA 的 target modules(否则会触发 vLLM 兼容性问题)。\n", - "> 因此本 Notebook 使用 **twinkle 客户端语法**,以获得对训练参数的完整控制。\n", + "> Twinkle 提供 `tinker` 和 `twinkle` 两套客户端 API。其中 **twinkle** 客户端提供更完整的参数控制(如 `LoraConfig`、`target_modules`),适合 GRPO 等需要精细调参的训练场景。\n", "\n", "| 配置项 | 默认值 | 说明 |\n", "|--------|--------|------|\n", - "| `MODEL_ID` | ms://Qwen/Qwen3.6-35B-A3B | 基座模型(需加 `ms://` 前缀) |\n", + "| `MODEL_ID` | ms://Qwen/Qwen3.6-27B | 基座模型(需加 `ms://` 前缀) |\n", "| `NUM_GENERATIONS` | 4 | 每个 prompt 生成几条回答 |\n", "| `MAX_NEW_TOKENS` | 1024 | 单条回答最大 token 数 |\n", "| `LEARNING_RATE` | 2e-5 | 学习率 |\n", "| `MAX_STEPS` | 100 | 最大训练步数 |\n", "| `BATCH_SIZE` | 2 | 每步的 prompt 数量(实际训练样本 = BATCH_SIZE × NUM_GENERATIONS) |\n", "| `TEMPERATURE` | 1.0 | 采样温度,RL 训练中通常设为 1.0 保持多样性 |\n", - "| `SYNC_INTERVAL` | 1 | 每隔多少步同步权重到采样端 |" - ] - }, - { - "cell_type": "markdown", - "id": "ef3af352", - "metadata": {}, - "source": [ - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。\n", - "如果你在本地使用 Megatron 进行 GRPO 训练,建议显式指定 `target_modules`(而非 `all-linear`):\n", - "\n", - "```python\n", - "target_modules:\n", - " - mlp.linear_fc1\n", - " - mlp.linear_fc2\n", - " - attn.proj\n", - " - shared_experts.linear_fc1\n", - " - shared_experts.linear_fc2\n", - " - linear_qkv\n", - " - in_proj\n", - " - out_proj\n", - " - linear_proj\n", - "```\n", - "\n", - "> **注意**:此配置是一个示例,由于问题来自 vLLM 侧的 MoE LoRA 支持尚不完善,实际训练效果可能受限。\n", - "> 如果不需要在线采样(vLLM),使用 `all-linear` 仍然可以正常训练。" + "| `SYNC_INTERVAL` | 1 | 每隔多少步同步权重到采样端 |\n" ] }, { @@ -177,7 +149,7 @@ "logger 
= get_logger()\n", "\n", "# ========== 全局配置 ==========\n", - "MODEL_ID = 'ms://Qwen/Qwen3.6-35B-A3B'\n", + "MODEL_ID = 'ms://Qwen/Qwen3.6-27B'\n", "NUM_GENERATIONS = 4\n", "MAX_NEW_TOKENS = 1024\n", "LEARNING_RATE = 2e-5\n", @@ -355,9 +327,7 @@ "\n", "- **`MultiLoraTransformersModel`**:支持 LoRA 适配器、损失函数、优化器、模板等全部设置\n", "- **`vLLMSampler`**:采样端,支持 `adapter_uri` 动态加载最新 LoRA 权重\n", - "- **`LoraConfig`**:可以精确控制 `target_modules`,这是使用 twinkle 语法的关键优势\n", - "\n", - "> 对于 MoE 模型,必须显式指定 `target_modules` 而非 `all-linear`,以避免 vLLM 兼容性问题。\n" + "- **`LoraConfig`**:可以精确控制 `target_modules`、`rank` 等参数\n" ] }, { @@ -376,15 +346,10 @@ "# 配置训练模型\n", "model = MultiLoraTransformersModel(model_id=MODEL_ID)\n", "\n", - "# LoRA 配置 —— 显式指定 target_modules(MoE 模型关键)\n", + "# LoRA 配置\n", "lora_config = LoraConfig(\n", - " target_modules=[\n", - " 'mlp.linear_fc1', 'mlp.linear_fc2',\n", - " 'attn.proj',\n", - " 'shared_experts.linear_fc1', 'shared_experts.linear_fc2',\n", - " 'linear_qkv', 'in_proj', 'out_proj', 'linear_proj',\n", - " ],\n", - " r=8,\n", + " target_modules='all-linear',\n", + " r=16,\n", " lora_alpha=32,\n", " lora_dropout=0.05,\n", ")\n", @@ -420,7 +385,7 @@ " 'logprobs': 1,\n", "}\n", "\n", - "print('模型和采样端配置完成')" + "print('模型和采样端配置完成')\n" ] }, { @@ -594,9 +559,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -614,7 +579,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: save_result.path\n", @@ -661,43 +626,59 @@ }, { "cell_type": "markdown", - "id": "4bc85a32", + "id": "peft_merge_0", + "metadata": {}, + "source": [ + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "peft_merge_1", "metadata": {}, + "outputs": [], "source": [ - "## 合并权重并导出\n", + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", + "\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "peft_merge_2", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir 
<替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" ] }, { diff --git a/src/twinkle/server/gateway/server.py b/src/twinkle/server/gateway/server.py index dd89b84c..755c5d2b 100644 --- a/src/twinkle/server/gateway/server.py +++ b/src/twinkle/server/gateway/server.py @@ -39,7 +39,7 @@ def __init__(self, self.http_options = http_options or {} self.proxy = ServiceProxy(http_options=http_options, route_prefix=self.route_prefix) self.supported_models = self._normalize_models(supported_models) or [ - types.SupportedModel(model_name='Qwen/Qwen3.6-35B-A3B'), + types.SupportedModel(model_name='Qwen/Qwen3.6-27B'), ] self._modelscope_config_lock = asyncio.Lock() From 8d0c2c1b78d9596a2cd5a97f2fdea5a7a3c17869 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:07:00 +0800 Subject: [PATCH 2/3] fix --- cookbook/rl/short_math_grpo_moe.py | 2 +- cookbook/rl/short_math_grpo_multi_lora.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbook/rl/short_math_grpo_moe.py b/cookbook/rl/short_math_grpo_moe.py index 6ad5cc2f..9d870eac 100644 --- a/cookbook/rl/short_math_grpo_moe.py +++ b/cookbook/rl/short_math_grpo_moe.py @@ -28,7 +28,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) diff --git a/cookbook/rl/short_math_grpo_multi_lora.py b/cookbook/rl/short_math_grpo_multi_lora.py index 96d7ef9b..9dad8df3 100644 --- a/cookbook/rl/short_math_grpo_multi_lora.py +++ b/cookbook/rl/short_math_grpo_multi_lora.py @@ -5,7 +5,7 @@ weights to a local directory, then passes the path to vLLMSampler via `adapter_path` so vLLM loads the latest adapter from disk. 
-Model: Qwen/Qwen3.6-27B (MoE, 35B total / 3B active) +Model: Qwen/Qwen3.6-35B-A3B (MoE, 35B total / 3B active) Model mesh: tp=2, ep=2, pp=2, sequence_parallel=True (8 GPUs) Sampler mesh: dp=2, tp=2, gpus_per_worker=2 (4 GPUs) @@ -35,7 +35,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2)) From 5aaf97801bb4cfaac45f2b2cd7400a112d433517 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:12:08 +0800 Subject: [PATCH 3/3] fix --- README.md | 2 +- README_ZH.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7c01765e..1ee5fc7e 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). +> by one training base at a time, and currently it is [Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| diff --git a/README_ZH.md b/README_ZH.md index 3496abb8..b52c85c1 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -130,7 +130,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:|