From 2843f7bc4b391030a45e52b6089e12ec432698af Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 16:54:43 +0800 Subject: [PATCH 1/8] wip --- README.md | 8 ++++---- README_ZH.md | 8 ++++---- cookbook/client/server/megatron/server_config.yaml | 12 ++++++------ cookbook/client/tinker/modelscope/sample.py | 4 ++-- cookbook/client/tinker/modelscope/self_cognition.py | 2 +- cookbook/client/tinker/modelscope/short_math_grpo.py | 2 +- cookbook/client/twinkle/modelscope/multi_modal.py | 2 +- .../client/twinkle/modelscope/self_congnition.py | 2 +- docs/source_en/Usage Guide/Train-as-a-Service.md | 10 +++++----- ...0\256\255\347\273\203\346\234\215\345\212\241.md" | 10 +++++----- src/twinkle/server/gateway/server.py | 2 +- 11 files changed, 31 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 4351f346..4713f442 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ sh INSTALL_MEGATRON.sh | twinkle client finetuning | transformer | [Script](cookbook/client/twinkle/transformer) | ## Changelog -- 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B) with a new release 0.2.0. +- 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0. - 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py). - 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/ - 🎉2026-03-19 Support GKD training ,please refer to this [cookbook](cookbook/rl/gkd_on_policy.py). @@ -135,7 +135,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B). +> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -184,7 +184,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3.5-27B' + base_model = 'ms://Qwen/Qwen3.6-35B-A3B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -240,7 +240,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-27B' +base_model = 'ms://Qwen/Qwen3.6-35B-A3B' base_url='your-base-url' api_key='your-api-key' diff --git a/README_ZH.md b/README_ZH.md index f34e0635..a6a2662f 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -94,7 +94,7 @@ sh INSTALL_MEGATRON.sh Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的,非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md) ## 更新日志 -🎉2026-04-14 ModelScope的训练服务部署为[Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B),并发布了0.2.0版本. +🎉2026-04-16 ModelScope的训练服务部署为[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B),并发布了0.2.0版本. 🎉2026-03-28 支持 DPO 训练,同时支持 Transformers 和 Megatron 后端。参考 [dpo_full.py](cookbook/rl/dpo_full.py) 和 [dpo_lora.py](cookbook/rl/dpo_lora.py)。 🎉2026-03-24 Twinkle 站点上线,访问地址 https://modelscope.github.io/twinkle-web/ 🎉2026-03-19 支持GKD蒸馏能力,参考[cookbook](cookbook/rl/gkd_on_policy.py)。 @@ -118,7 +118,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.5-27B](https://modelscope.cn/models/Qwen/Qwen3.5-27B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -166,7 +166,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' - base_model = 'ms://Qwen/Qwen3.5-27B' + base_model = 'ms://Qwen/Qwen3.6-35B-A3B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -222,7 +222,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-27B' +base_model = 'ms://Qwen/Qwen3.6-35B-A3B' base_url='your-base-url' api_key='your-api-key' diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index 08375d7d..abedb10a 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -38,11 +38,11 @@ applications: # 3. Sampler Service - Runs inference / sampling using vLLM engine # Used for generating text from the model (e.g., evaluating LoRA results). 
# Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA - - name: sampler-Qwen3.5-27B - route_prefix: /api/v1/sampler/Qwen/Qwen3.5-27B + - name: sampler-Qwen3.6-35B-A3B + route_prefix: /api/v1/sampler/Qwen/Qwen3.6-35B-A3B import_path: sampler args: - model_id: "ms://Qwen/Qwen3.5-27B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier nproc_per_node: 4 # Number of GPU processes per node sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler) engine_args: # vLLM engine-specific settings @@ -80,12 +80,12 @@ applications: # 2. Model Service - Hosts the base model for training. # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training - - name: models-Qwen3.5-27B - route_prefix: /api/v1/model/Qwen/Qwen3.5-27B + - name: models-Qwen3.6-35B-A3B + route_prefix: /api/v1/model/Qwen/Qwen3.6-35B-A3B import_path: model args: use_megatron: true # Use Megatron-LM backend - model_id: "ms://Qwen/Qwen3.5-27B" # ModelScope model identifier + model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier max_length: 32000 # model max length max_loras: 5 # model max loras nproc_per_node: 4 # Number of GPU processes per node diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py index 40c9b327..ca3aa62b 100644 --- a/cookbook/client/tinker/modelscope/sample.py +++ b/cookbook/client/tinker/modelscope/sample.py @@ -16,7 +16,7 @@ from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -29,7 +29,7 @@ # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. 
sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.6-35B-A3B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 2347c7fc..ac98be89 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -23,7 +23,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index c361b934..cf210c46 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== -BASE_MODEL = 'Qwen/Qwen3.5-27B' +BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 2e-5 diff --git a/cookbook/client/twinkle/modelscope/multi_modal.py b/cookbook/client/twinkle/modelscope/multi_modal.py index f7a54ccf..331352c8 100644 --- a/cookbook/client/twinkle/modelscope/multi_modal.py +++ b/cookbook/client/twinkle/modelscope/multi_modal.py @@ -24,7 +24,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index aeef5606..2248ddd4 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -21,7 +21,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index 692ef3f4..f176e7bb 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -2,7 +2,7 @@ Alongside the open-source release of the Twinkle framework, we also provide a hosted model training service (Training as a Service) powered by ModelScope's backend infrastructure. Developers can use this service to experience Twinkle's training API for free. -The model currently running on the cluster is [Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B). Below are the detailed usage instructions: +The model currently running on the cluster is [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). Below are the detailed usage instructions: ## Step 1. 
Register a ModelScope Account and Apply to Join the twinkle-explorers Organization @@ -30,7 +30,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-27B' +base_model = 'ms://Qwen/Qwen3.6-35B-A3B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -64,7 +64,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.5-27B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: +With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.6-35B-A3B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: ```python import os @@ -79,7 +79,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -92,7 +92,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.6-35B-A3B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 0c7afc44..66676d4e 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -3,7 +3,7 @@ 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 -目前在集群中运行的模型是[Qwen/Qwen3.5-27B](https://www.modelscope.cn/models/Qwen/Qwen3.5-27B)。下面介绍具体的使用方法: +目前在集群中运行的模型是[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。下面介绍具体的使用方法: ## Step 1. 
注册ModelScope用户并申请加入 twinkle-explorers 组织 @@ -33,7 +33,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.5-27B' +base_model = 'ms://Qwen/Qwen3.6-35B-A3B' base_url='http://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -67,7 +67,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.5-27B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: +通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.6-35B-A3B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: ```python import os @@ -82,7 +82,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.5-27B' +base_model = 'Qwen/Qwen3.6-35B-A3B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server @@ -95,7 +95,7 @@ service_client = ServiceClient( # The model_path is a twinkle:// URI pointing to a previously saved LoRA checkpoint. # The server will load the base model and apply the LoRA adapter weights. sampling_client = service_client.create_sampling_client( - model_path='twinkle://xxx-Qwen_Qwen3.5-27B-xxx/weights/twinkle-lora-1', + model_path='twinkle://xxx-Qwen_Qwen3.6-35B-A3B-xxx/weights/twinkle-lora-1', base_model=base_model ) diff --git a/src/twinkle/server/gateway/server.py b/src/twinkle/server/gateway/server.py index 84910baf..98e2dba5 100644 --- a/src/twinkle/server/gateway/server.py +++ b/src/twinkle/server/gateway/server.py @@ -37,7 +37,7 @@ def __init__(self, self.http_options = http_options or {} self.proxy = ServiceProxy(http_options=http_options, route_prefix=self.route_prefix) self.supported_models = self._normalize_models(supported_models) or [ - types.SupportedModel(model_name='Qwen/Qwen3.5-27B'), + types.SupportedModel(model_name='Qwen/Qwen3.6-35B-A3B'), ] self._modelscope_config_lock = asyncio.Lock() From cd3429ee6176b35829d81559ff5204e9ff3a02ad Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 17:01:39 +0800 Subject: [PATCH 2/8] wip --- cookbook/client/tinker/modelscope/self_cognition.py | 2 +- cookbook/client/tinker/self_host/self_cognition.py | 2 +- docs/source_en/Components/Template/Template.md | 4 ---- docs/source_en/Usage Guide/Train-as-a-Service.md | 2 +- .../\350\256\255\347\273\203\346\234\215\345\212\241.md" | 2 +- .../\346\250\241\346\235\277/Template.md" | 4 ---- src/twinkle/sampler/base.py | 4 ++-- src/twinkle/template/base.py | 4 ++-- 8 files changed, 8 insertions(+), 16 deletions(-) diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index ac98be89..7780df60 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -107,7 +107,7 @@ def eval(): ] ) - input_feature = template.encode(trajectory, add_generation_prompt=True) + input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] input_ids = input_feature['input_ids'].tolist() diff --git a/cookbook/client/tinker/self_host/self_cognition.py b/cookbook/client/tinker/self_host/self_cognition.py index 691662e6..6d33b6c8 100644 --- a/cookbook/client/tinker/self_host/self_cognition.py +++ b/cookbook/client/tinker/self_host/self_cognition.py @@ -109,7 +109,7 @@ def eval(): ] ) - input_feature = template.encode(trajectory, add_generation_prompt=True) + input_feature = template.batch_encode([trajectory], 
add_generation_prompt=True)[0] input_ids = input_feature['input_ids'].tolist() diff --git a/docs/source_en/Components/Template/Template.md b/docs/source_en/Components/Template/Template.md index 32709361..60962a33 100644 --- a/docs/source_en/Components/Template/Template.md +++ b/docs/source_en/Components/Template/Template.md @@ -13,10 +13,6 @@ class Template: default_system: Optional[str] = None): ... - def encode(self, trajectory: Trajectory, add_generation_prompt: bool = False) -> InputFeature: - # Encode a single sample - ... - def batch_encode(self, trajectories: Union[Dict[str, Any], List[Trajectory]]) -> List[InputFeature]: # Batch encode samples ... diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index f176e7bb..7ed8db76 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -108,7 +108,7 @@ trajectory = Trajectory( ] ) -input_feature = template.encode(trajectory, add_generation_prompt=True) +input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] input_ids = input_feature['input_ids'].tolist() diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 66676d4e..9ad901de 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -111,7 +111,7 @@ trajectory = Trajectory( ] ) -input_feature = template.encode(trajectory, add_generation_prompt=True) +input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] input_ids = input_feature['input_ids'].tolist() diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" index d9cdba97..6e77b415 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\250\241\346\235\277/Template.md" @@ -13,10 +13,6 @@ class Template: default_system: Optional[str] = None): ... - def encode(self, trajectory: Trajectory, add_generation_prompt: bool = False) -> InputFeature: - # 编码单条样本 - ... - def batch_encode(self, trajectories: Union[Dict[str, Any], List[Trajectory]]) -> List[InputFeature]: # 批量编码样本 ... diff --git a/src/twinkle/sampler/base.py b/src/twinkle/sampler/base.py index d8222ead..47959b92 100644 --- a/src/twinkle/sampler/base.py +++ b/src/twinkle/sampler/base.py @@ -79,11 +79,11 @@ def encode_trajectory(self, if template is None: raise ValueError(f"Template not set for adapter '{adapter_name}'. 
Use set_template() first.") - encoded = template.encode(trajectory, add_generation_prompt=add_generation_prompt) + encoded = template._encode(trajectory, add_generation_prompt=add_generation_prompt) input_ids = encoded.get('input_ids') if input_ids is None: - raise ValueError("Template.encode() must return 'input_ids'") + raise ValueError("Template._encode() must return 'input_ids'") if hasattr(input_ids, 'tolist'): input_ids = input_ids.tolist() diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py index 9772f42c..cb2419b1 100644 --- a/src/twinkle/template/base.py +++ b/src/twinkle/template/base.py @@ -548,7 +548,7 @@ def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool = trajectory.update(input_feature) return trajectory - def encode(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature: + def _encode(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature: return self._encode_messages(trajectory, add_generation_prompt, **kwargs) @staticmethod @@ -645,7 +645,7 @@ def batch_encode( from concurrent.futures import ThreadPoolExecutor from functools import partial encode_fn = partial( - self.encode, + self._encode, add_generation_prompt=add_generation_prompt, **kwargs, ) From 1e7df011938b38b0fffc7c366efbd03797d58c79 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 20:35:03 +0800 Subject: [PATCH 3/8] fix --- docs/source_en/Components/Gym/Gym.md | 26 ++++++ docs/source_en/Components/Gym/index.rst | 6 ++ docs/source_en/Components/Hub/Hub.md | 20 ++++ docs/source_en/Components/Hub/index.rst | 6 ++ .../Components/Loss Scale/LossScale.md | 16 ++++ .../source_en/Components/Loss Scale/index.rst | 6 ++ .../Components/Loss/ChunkedCrossEntropy.md | 22 +++++ docs/source_en/Components/Loss/DPOLoss.md | 70 ++++++++++++++ docs/source_en/Components/Loss/GKDLoss.md | 27 ++++++ docs/source_en/Components/Loss/GRPOLoss.md | 75 +++++++++++++++ docs/source_en/Components/Loss/MSELoss.md | 12 +++ docs/source_en/Components/Loss/index.rst | 5 + .../Metrics/CompletionRewardMetric.md | 27 ++++++ .../source_en/Components/Metrics/DPOMetric.md | 27 ++++++ docs/source_en/Components/Metrics/index.rst | 2 + .../Built-in-Preprocessors.md | 92 +++++++++++++++++++ .../Preprocessor and Filter/index.rst | 1 + .../Components/Reward/GSM8KReward.md | 34 +++++++ .../Components/Reward/MultiModalReward.md | 22 +++++ .../Components/Reward/OlympiadBenchReward.md | 55 +++++++++++ docs/source_en/Components/Reward/index.rst | 3 + .../Task Processor/GRPOProcessor.md | 19 ++++ .../Components/Task Processor/index.rst | 1 + docs/source_en/index.rst | 3 + docs/source_zh/index.rst | 3 + .../\347\273\204\344\273\266/Gym/Gym.md" | 26 ++++++ .../\347\273\204\344\273\266/Gym/index.rst" | 6 ++ .../\347\273\204\344\273\266/Hub/Hub.md" | 20 ++++ .../\347\273\204\344\273\266/Hub/index.rst" | 6 ++ .../GRPOProcessor.md" | 19 ++++ .../index.rst" | 1 + .../\345\245\226\345\212\261/GSM8KReward.md" | 34 +++++++ .../MultiModalReward.md" | 22 +++++ .../OlympiadBenchReward.md" | 55 +++++++++++ .../\345\245\226\345\212\261/index.rst" | 3 + .../CompletionRewardMetric.md" | 27 ++++++ .../\346\214\207\346\240\207/DPOMetric.md" | 27 ++++++ .../\346\214\207\346\240\207/index.rst" | 2 + .../ChunkedCrossEntropy.md" | 22 +++++ .../\346\215\237\345\244\261/DPOLoss.md" | 70 ++++++++++++++ .../\346\215\237\345\244\261/GKDLoss.md" | 27 ++++++ .../\346\215\237\345\244\261/GRPOLoss.md" | 75 +++++++++++++++ 
.../\346\215\237\345\244\261/MSELoss.md" | 12 +++ .../\346\215\237\345\244\261/index.rst" | 5 + .../LossScale.md" | 16 ++++ .../index.rst" | 6 ++ .../index.rst" | 1 + ...04\345\244\204\347\220\206\345\231\250.md" | 92 +++++++++++++++++++ 48 files changed, 1154 insertions(+) create mode 100644 docs/source_en/Components/Gym/Gym.md create mode 100644 docs/source_en/Components/Gym/index.rst create mode 100644 docs/source_en/Components/Hub/Hub.md create mode 100644 docs/source_en/Components/Hub/index.rst create mode 100644 docs/source_en/Components/Loss Scale/LossScale.md create mode 100644 docs/source_en/Components/Loss Scale/index.rst create mode 100644 docs/source_en/Components/Loss/ChunkedCrossEntropy.md create mode 100644 docs/source_en/Components/Loss/DPOLoss.md create mode 100644 docs/source_en/Components/Loss/GKDLoss.md create mode 100644 docs/source_en/Components/Loss/GRPOLoss.md create mode 100644 docs/source_en/Components/Loss/MSELoss.md create mode 100644 docs/source_en/Components/Metrics/CompletionRewardMetric.md create mode 100644 docs/source_en/Components/Metrics/DPOMetric.md create mode 100644 docs/source_en/Components/Preprocessor and Filter/Built-in-Preprocessors.md create mode 100644 docs/source_en/Components/Reward/GSM8KReward.md create mode 100644 docs/source_en/Components/Reward/MultiModalReward.md create mode 100644 docs/source_en/Components/Reward/OlympiadBenchReward.md create mode 100644 docs/source_en/Components/Task Processor/GRPOProcessor.md create mode 100644 "docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/Gym/index.rst" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/Hub/Hub.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/Hub/index.rst" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/GSM8KReward.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/MultiModalReward.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/OlympiadBenchReward.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/CompletionRewardMetric.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/DPOMetric.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/ChunkedCrossEntropy.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/DPOLoss.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GKDLoss.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GRPOLoss.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/MSELoss.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/LossScale.md" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/index.rst" create mode 100644 "docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/\345\206\205\347\275\256\351\242\204\345\244\204\347\220\206\345\231\250.md" diff --git a/docs/source_en/Components/Gym/Gym.md b/docs/source_en/Components/Gym/Gym.md new file mode 100644 index 
00000000..4db355b8 --- /dev/null +++ b/docs/source_en/Components/Gym/Gym.md @@ -0,0 +1,26 @@ +# Gym + +The Gym component provides an interface for reinforcement learning environments in Twinkle. + +```python +from twinkle.gym import Gym + +class CustomGym(Gym): + + def step(self, trajectories, **kwargs): + """ + Execute one RL step: evaluate trajectories and return rewards. + + Args: + trajectories: Model-generated trajectories to evaluate + **kwargs: Additional arguments + + Returns: + Reward values for each trajectory + """ + ... +``` + +The Gym abstraction allows you to plug in custom RL environments that interact with the training loop. It decouples reward computation and environment interaction from the core training logic. + +> Gym is typically used in on-policy RL training where the environment needs to provide feedback on model-generated outputs. diff --git a/docs/source_en/Components/Gym/index.rst b/docs/source_en/Components/Gym/index.rst new file mode 100644 index 00000000..85d941b9 --- /dev/null +++ b/docs/source_en/Components/Gym/index.rst @@ -0,0 +1,6 @@ +Gym +=============== +.. toctree:: + :maxdepth: 1 + + Gym.md diff --git a/docs/source_en/Components/Hub/Hub.md b/docs/source_en/Components/Hub/Hub.md new file mode 100644 index 00000000..f0380993 --- /dev/null +++ b/docs/source_en/Components/Hub/Hub.md @@ -0,0 +1,20 @@ +# Hub + +The Hub component provides unified access to model and dataset hubs, supporting both ModelScope and Hugging Face. + +```python +from twinkle.hub import Hub + +# Download from ModelScope +Hub.download('ms://Qwen/Qwen3.5-4B', local_dir='./models') + +# Download from Hugging Face +Hub.download('hf://Qwen/Qwen3.5-4B', local_dir='./models') + +# Upload checkpoints +Hub.upload(local_path='./my-model', repo_id='my-org/my-model', hub='ms') +``` + +The `ms://` and `hf://` prefixes determine which hub to use. Hub handles authentication, caching, and progress tracking automatically. + +> Hub is used internally by Dataset, Model, and other components. You can also use it directly for custom download/upload workflows. diff --git a/docs/source_en/Components/Hub/index.rst b/docs/source_en/Components/Hub/index.rst new file mode 100644 index 00000000..3d6140e2 --- /dev/null +++ b/docs/source_en/Components/Hub/index.rst @@ -0,0 +1,6 @@ +Hub +=============== +.. toctree:: + :maxdepth: 1 + + Hub.md diff --git a/docs/source_en/Components/Loss Scale/LossScale.md b/docs/source_en/Components/Loss Scale/LossScale.md new file mode 100644 index 00000000..98f31194 --- /dev/null +++ b/docs/source_en/Components/Loss Scale/LossScale.md @@ -0,0 +1,16 @@ +# Loss Scale + +The LossScale component controls loss scaling for numerical stability during training, particularly useful for mixed-precision training. + +```python +from twinkle.loss_scale import LossScale + +loss_scale = LossScale() + +# Apply scaling to loss before backward +scaled_loss = loss_scale(loss, num_tokens) +``` + +LossScale handles the normalization of loss values by the number of valid tokens, ensuring consistent gradient magnitudes across different batch sizes and sequence lengths. + +> LossScale is used internally by the model's training pipeline. It is automatically applied when using `model.forward_backward()`. diff --git a/docs/source_en/Components/Loss Scale/index.rst b/docs/source_en/Components/Loss Scale/index.rst new file mode 100644 index 00000000..83e4fd9e --- /dev/null +++ b/docs/source_en/Components/Loss Scale/index.rst @@ -0,0 +1,6 @@ +Loss Scale +=============== +.. 
toctree:: + :maxdepth: 1 + + LossScale.md diff --git a/docs/source_en/Components/Loss/ChunkedCrossEntropy.md b/docs/source_en/Components/Loss/ChunkedCrossEntropy.md new file mode 100644 index 00000000..11fd6535 --- /dev/null +++ b/docs/source_en/Components/Loss/ChunkedCrossEntropy.md @@ -0,0 +1,22 @@ +# Chunked Cross Entropy + +A memory-efficient variant of cross-entropy loss that processes the vocabulary dimension in chunks to reduce peak GPU memory usage. + +```python +from twinkle.loss import ChunkedCrossEntropyLoss + +loss_fn = ChunkedCrossEntropyLoss( + chunk_size=1024, # vocabulary chunk size + reduction='mean', +) + +model.set_loss(loss_fn) +``` + +**Parameters:** +- `chunk_size`: Number of vocabulary tokens to process per chunk (default: 1024) +- `reduction`: Reduction mode — `sum`, `mean`, or `none` + +The implementation uses a custom autograd function that splits the logit-to-loss computation into chunks along the vocabulary dimension. This avoids materializing the full `[batch*seq_len, vocab_size]` probability tensor, significantly reducing memory for large vocabularies. + +> Useful when training with large vocabulary models where standard cross-entropy causes OOM errors. diff --git a/docs/source_en/Components/Loss/DPOLoss.md b/docs/source_en/Components/Loss/DPOLoss.md new file mode 100644 index 00000000..c85cbf40 --- /dev/null +++ b/docs/source_en/Components/Loss/DPOLoss.md @@ -0,0 +1,70 @@ +# DPO Loss + +Direct Preference Optimization (DPO) and its variants are used for aligning models with human preferences without requiring a separate reward model. + +## DPOLoss + +The standard DPO loss supports multiple loss types and optional reference-free mode. + +```python +from twinkle.loss import DPOLoss + +loss_fn = DPOLoss( + loss_type='sigmoid', # 'sigmoid', 'hinge', 'ipo', 'kto' + beta=0.1, + sft_weight=0.0, # optional SFT regularization weight + reference_free=False, +) + +model.set_loss(loss_fn) +``` + +**Parameters:** +- `loss_type`: DPO variant — `sigmoid` (default), `hinge`, `ipo`, or `kto` +- `beta`: Temperature parameter controlling preference strength +- `sft_weight`: Weight for an additional SFT loss term on chosen responses +- `reference_free`: If `True`, skips reference model log-probabilities + +The loss expects interleaved chosen/rejected pairs in the batch. It computes sequence-level log-probabilities and optimizes the policy to prefer chosen over rejected responses. + +## SimPOLoss + +Simplified Preference Optimization that removes the need for a reference model by using length-normalized log-probabilities. + +```python +from twinkle.loss import SimPOLoss + +loss_fn = SimPOLoss(beta=2.0, gamma=1.0) +``` + +**Parameters:** +- `beta`: Scaling factor for the logit difference +- `gamma`: Margin term added to preference gap + +## CPOLoss + +Contrastive Preference Optimization combines preference learning with behavior cloning. + +```python +from twinkle.loss import CPOLoss + +loss_fn = CPOLoss(beta=0.1, cpo_alpha=1.0) +``` + +**Parameters:** +- `beta`: Temperature for the preference loss +- `cpo_alpha`: Weight of the behavior cloning (NLL) loss on chosen responses + +## ORPOLoss + +Odds Ratio Preference Optimization unifies SFT and preference alignment in a single loss. + +```python +from twinkle.loss import ORPOLoss + +loss_fn = ORPOLoss(beta=0.1) +``` + +The loss combines a standard NLL term on chosen responses with a log-odds-ratio penalty that pushes the model away from rejected responses. 
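+
+For reference, the following is a minimal sketch of the `sigmoid` DPO objective described at the top of this page. It is illustrative only, not necessarily how `DPOLoss` is implemented internally; the function name and tensor names are assumptions. It operates on sequence-level log-probabilities for chosen/rejected pairs:
+
+```python
+import torch.nn.functional as F
+
+def dpo_sigmoid_loss(chosen_logps, rejected_logps,
+                     ref_chosen_logps, ref_rejected_logps, beta=0.1):
+    # Implicit reward margin: how much more the policy prefers the chosen
+    # response over the rejected one, relative to the reference model.
+    logits = (chosen_logps - rejected_logps) \
+        - (ref_chosen_logps - ref_rejected_logps)
+    # Minimize the negative log-sigmoid of the scaled margin.
+    return -F.logsigmoid(beta * logits).mean()
+```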
+ +> All preference losses inherit shared utilities from `PreferenceLossBase`, including log-probability computation, chosen/rejected splitting, and sequence-level aggregation. diff --git a/docs/source_en/Components/Loss/GKDLoss.md b/docs/source_en/Components/Loss/GKDLoss.md new file mode 100644 index 00000000..07a4fe48 --- /dev/null +++ b/docs/source_en/Components/Loss/GKDLoss.md @@ -0,0 +1,27 @@ +# GKD Loss + +Generalized Knowledge Distillation (GKD) loss uses Jensen-Shannon Divergence for distilling knowledge from a teacher model to a student model. + +```python +from twinkle.loss import GKDLoss + +loss_fn = GKDLoss( + teacher_mode='full', # 'full', 'topk_local', 'topk_remote' + beta=0.5, # interpolation weight for JSD + temperature=1.0, +) + +model.set_loss(loss_fn) +``` + +**Parameters:** +- `teacher_mode`: How teacher logits are obtained + - `full`: Full vocabulary logits from a local teacher model + - `topk_local`: Top-k logits from a local teacher with chunked computation for memory efficiency + - `topk_remote`: Top-k logits from a remote API teacher +- `beta`: Interpolation weight between student and teacher distributions in JSD (0 = pure student, 1 = pure teacher) +- `temperature`: Softmax temperature for both student and teacher distributions + +The GKD loss implements chunked computation internally to reduce peak memory usage when working with large vocabularies. + +> GKD is useful for training smaller student models that mimic the behavior of larger teacher models, and supports both local and remote teacher setups. diff --git a/docs/source_en/Components/Loss/GRPOLoss.md b/docs/source_en/Components/Loss/GRPOLoss.md new file mode 100644 index 00000000..9817d491 --- /dev/null +++ b/docs/source_en/Components/Loss/GRPOLoss.md @@ -0,0 +1,75 @@ +# GRPO Loss + +Group Relative Policy Optimization (GRPO) and its variants implement policy gradient losses with PPO-style clipping and KL regularization. + +## GRPOLoss + +The standard GRPO loss with importance sampling, PPO clipping, and optional KL penalty. + +```python +from twinkle.loss import GRPOLoss + +loss_fn = GRPOLoss( + clip_range=0.2, + beta=0.01, # KL penalty coefficient +) + +model.set_loss(loss_fn) +``` + +**Parameters:** +- `clip_range`: PPO clipping range for importance weights (default: 0.2) +- `beta`: KL divergence penalty coefficient. Set to 0 to disable KL regularization + +The loss handles both standard batches and packed sequences (detected via `position_ids`). It computes per-token importance weights, applies PPO clipping, and optionally adds a KL penalty term against the reference policy. + +## Variants + +Twinkle provides several GRPO variants: + +### GSPOLoss + +Sequence-level importance sampling variant that computes importance weights at the sequence level rather than token level. + +```python +from twinkle.loss import GSPOLoss +loss_fn = GSPOLoss(clip_range=0.2, beta=0.01) +``` + +### SAPOLoss + +Soft-gated Advantage Policy Optimization applies a sigmoid gate on the advantage to control the optimization direction. + +```python +from twinkle.loss import SAPOLoss +loss_fn = SAPOLoss(clip_range=0.2, beta=0.01, tau=1.0) +``` + +### CISPOLoss + +Clipped Importance Sampling Policy Optimization applies explicit clipping to importance weights before multiplying with advantages. + +```python +from twinkle.loss import CISPOLoss +loss_fn = CISPOLoss(clip_range=0.2, beta=0.01) +``` + +### BNPOLoss + +Batch-Normalized Policy Optimization normalizes per-token loss across the batch before aggregation. 
+ +```python +from twinkle.loss import BNPOLoss +loss_fn = BNPOLoss(clip_range=0.2, beta=0.01) +``` + +### DRGRPOLoss + +Dynamic Ratio GRPO with fixed normalization that uses a fixed denominator for importance weight computation. + +```python +from twinkle.loss import DRGRPOLoss +loss_fn = DRGRPOLoss(clip_range=0.2, beta=0.01) +``` + +> All GRPO variants share the same base pipeline for packed-sequence handling, log-probability alignment, and KL penalty computation. They differ primarily in how importance weights and advantages are combined. diff --git a/docs/source_en/Components/Loss/MSELoss.md b/docs/source_en/Components/Loss/MSELoss.md new file mode 100644 index 00000000..576fd80d --- /dev/null +++ b/docs/source_en/Components/Loss/MSELoss.md @@ -0,0 +1,12 @@ +# MSE Loss + +Mean Squared Error loss for regression-style training tasks. + +```python +from twinkle.loss import MSELoss + +loss_fn = MSELoss() +model.set_loss(loss_fn) +``` + +MSELoss computes the mean squared error between model output logits and the target labels. It is useful for tasks such as reward model training or value function estimation. diff --git a/docs/source_en/Components/Loss/index.rst b/docs/source_en/Components/Loss/index.rst index bf014466..dceaf20f 100644 --- a/docs/source_en/Components/Loss/index.rst +++ b/docs/source_en/Components/Loss/index.rst @@ -4,4 +4,9 @@ Loss :maxdepth: 1 CrossEntropy.md + ChunkedCrossEntropy.md + DPOLoss.md + GKDLoss.md + GRPOLoss.md + MSELoss.md Building-Loss.md diff --git a/docs/source_en/Components/Metrics/CompletionRewardMetric.md b/docs/source_en/Components/Metrics/CompletionRewardMetric.md new file mode 100644 index 00000000..79742861 --- /dev/null +++ b/docs/source_en/Components/Metrics/CompletionRewardMetric.md @@ -0,0 +1,27 @@ +# CompletionRewardMetric + +The CompletionRewardMetric aggregates key statistics during RLHF training, including generation time, weight synchronization time, reward scores, and completion lengths. + +```python +from twinkle.metric import CompletionRewardMetric + +metric = CompletionRewardMetric(device_mesh=..., process_group=...) + +# Accumulate during training loop +metric.accumulate( + inputs, + outputs, + generation_time=gen_time, + weight_sync_time=sync_time, + rewards=reward_values, + completions=completion_texts, +) + +# Calculate aggregated metrics +result = metric.calculate() +# result contains: generation_time, weight_sync_time, mean_reward, mean_completion_length, etc. +``` + +This metric is designed for GRPO and other RL training loops where monitoring generation quality and system performance is essential. + +> CompletionRewardMetric performs DP-aware aggregation, correctly averaging metrics across all data-parallel ranks. diff --git a/docs/source_en/Components/Metrics/DPOMetric.md b/docs/source_en/Components/Metrics/DPOMetric.md new file mode 100644 index 00000000..809d3083 --- /dev/null +++ b/docs/source_en/Components/Metrics/DPOMetric.md @@ -0,0 +1,27 @@ +# DPOMetric + +The DPOMetric tracks preference optimization-specific statistics during DPO training. + +```python +from twinkle.metric import DPOMetric + +metric = DPOMetric(device_mesh=..., process_group=...) 
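+# (device_mesh and process_group are placeholders; they are assumed to come
+# from your distributed training setup and enable the DP-aware aggregation)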
+ +# Accumulate after each forward pass +metric.accumulate(inputs, outputs, ref_outputs=ref_outputs) + +# Calculate aggregated metrics +result = metric.calculate() +``` + +**Tracked metrics:** +- `chosen_logps`: Average log-probability of chosen responses +- `rejected_logps`: Average log-probability of rejected responses +- `ref_chosen_logps`: Reference model log-probability of chosen responses +- `ref_rejected_logps`: Reference model log-probability of rejected responses +- `rewards/chosen`: Implicit reward for chosen responses +- `rewards/rejected`: Implicit reward for rejected responses +- `accuracy`: Fraction of pairs where chosen is preferred over rejected +- `margin`: Average reward margin between chosen and rejected + +> DPOMetric performs DP-aware aggregation across all data-parallel ranks. It supports both interleaved and separate chosen/rejected batch formats. diff --git a/docs/source_en/Components/Metrics/index.rst b/docs/source_en/Components/Metrics/index.rst index 4bc035b1..5d50e183 100644 --- a/docs/source_en/Components/Metrics/index.rst +++ b/docs/source_en/Components/Metrics/index.rst @@ -6,4 +6,6 @@ Metrics TrainMetric.md LossMetric.md Accuracy.md + CompletionRewardMetric.md + DPOMetric.md Building-Metrics.md diff --git a/docs/source_en/Components/Preprocessor and Filter/Built-in-Preprocessors.md b/docs/source_en/Components/Preprocessor and Filter/Built-in-Preprocessors.md new file mode 100644 index 00000000..eb6482de --- /dev/null +++ b/docs/source_en/Components/Preprocessor and Filter/Built-in-Preprocessors.md @@ -0,0 +1,92 @@ +# Built-in Preprocessors + +Twinkle provides a collection of built-in preprocessors for common dataset formats. Each converts raw data into standardized `Trajectory` objects. + +## LLM Preprocessors + +### CompetitionMathProcessor + +Converts competition math datasets with `problem` and `solution` fields. + +```python +dataset.map('CompetitionMathProcessor') +# Input: {'problem': '...', 'solution': '...'} +# Output: Trajectory with user message (problem) and assistant message (solution) +``` + +### CompetitionMathGRPOProcessor + +Similar to CompetitionMathProcessor but stores the solution in `user_data` for use as ground truth in GRPO reward computation. + +```python +dataset.map('CompetitionMathGRPOProcessor') +``` + +### SelfCognitionProcessor + +Replaces template placeholders with model identity information for self-cognition training. + +```python +dataset.map('SelfCognitionProcessor', model_name='MyModel', model_author='MyOrg') +``` + +### AlpacaProcessor + +Converts Alpaca-format datasets with `instruction`, `input`, and `output` fields. + +```python +dataset.map('AlpacaProcessor') +# Input: {'instruction': '...', 'input': '...', 'output': '...'} +``` + +### CountdownProcessor + +Generates countdown arithmetic problems for reasoning training. + +```python +dataset.map('CountdownProcessor') +``` + +### GSM8KProcessor + +Preprocesses GSM8K math datasets, extracting ground truth answers from the `#### answer` format. + +```python +dataset.map('GSM8KProcessor') +# Extracts answer from '#### 42' format and stores in user_data +``` + +## DPO Preprocessor + +### EmojiDPOProcessor + +Converts emoji-based preference datasets into positive/negative trajectory pairs for DPO training. 
+ +```python +dataset.map('EmojiDPOProcessor') +# Input: {'prompt': '...', 'chosen': '...', 'rejected': '...'} +# Output: Interleaved chosen and rejected Trajectory pairs +``` + +## Multimodal Preprocessors + +### CLEVRProcessor + +Preprocesses CLEVR visual reasoning datasets with image handling. + +```python +dataset.map('CLEVRProcessor') +# Input: {'question': '...', 'answer': '...', 'image': PIL.Image} +# Output: Trajectory with multimodal content (image + text) +``` + +### OlympiadBenchProcessor + +Preprocesses OlympiadBench multimodal math/physics problems with image collection and metadata storage. + +```python +dataset.map('OlympiadBenchProcessor') +# Handles multiple images per problem, stores ground truth and metadata in user_data +``` + +> All preprocessors follow the same interface: `__call__(rows) -> List[Trajectory]`. You can register custom preprocessors following the same pattern (see [Preprocessor](Preprocessor.md)). diff --git a/docs/source_en/Components/Preprocessor and Filter/index.rst b/docs/source_en/Components/Preprocessor and Filter/index.rst index 0a142af0..a91194c8 100644 --- a/docs/source_en/Components/Preprocessor and Filter/index.rst +++ b/docs/source_en/Components/Preprocessor and Filter/index.rst @@ -4,4 +4,5 @@ Preprocessor and Filter :maxdepth: 1 Preprocessor.md + Built-in-Preprocessors.md Filter.md diff --git a/docs/source_en/Components/Reward/GSM8KReward.md b/docs/source_en/Components/Reward/GSM8KReward.md new file mode 100644 index 00000000..5c3c7558 --- /dev/null +++ b/docs/source_en/Components/Reward/GSM8KReward.md @@ -0,0 +1,34 @@ +# GSM8K Reward + +Reward functions specifically designed for evaluating GSM8K math problem solutions. + +## GSM8KAccuracyReward + +Evaluates the correctness of GSM8K answers by extracting boxed or hash-formatted (`####`) answers and performing numeric/string comparison. + +```python +from twinkle.reward import GSM8KAccuracyReward + +reward_fn = GSM8KAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 for correct, 0.0 for incorrect +``` + +The reward function: +1. Extracts the answer from `\boxed{...}` or `#### ...` format in the model's completion +2. Extracts the ground truth answer from the reference trajectory +3. Performs numeric comparison (with tolerance) or exact string matching + +## GSM8KFormatReward + +Checks whether the model output contains a properly formatted answer section. + +```python +from twinkle.reward import GSM8KFormatReward + +reward_fn = GSM8KFormatReward() +rewards = reward_fn(trajectories, ground_truths) +# rewards: List[float], 1.0 if format is valid, 0.0 otherwise +``` + +> Use GSM8KAccuracyReward and GSM8KFormatReward together as a composite reward for GRPO training on math problem solving tasks. diff --git a/docs/source_en/Components/Reward/MultiModalReward.md b/docs/source_en/Components/Reward/MultiModalReward.md new file mode 100644 index 00000000..0d50d183 --- /dev/null +++ b/docs/source_en/Components/Reward/MultiModalReward.md @@ -0,0 +1,22 @@ +# MultiModal Reward + +Reward function for evaluating multimodal visual question answering (VQA) tasks. + +## MultiModalAccuracyReward + +Evaluates the correctness of multimodal VQA answers with a fallback to symbolic math verification. 
+ +```python +from twinkle.reward import MultiModalAccuracyReward + +reward_fn = MultiModalAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 for correct, 0.0 for incorrect +``` + +The reward function: +1. Extracts the model's answer from the completion text +2. Compares with ground truth using exact string matching +3. Falls back to `math_verify` for symbolic expression comparison when string matching fails + +> Designed for visual reasoning tasks such as CLEVR, where answers may be numeric, boolean, or short text. diff --git a/docs/source_en/Components/Reward/OlympiadBenchReward.md b/docs/source_en/Components/Reward/OlympiadBenchReward.md new file mode 100644 index 00000000..2150138e --- /dev/null +++ b/docs/source_en/Components/Reward/OlympiadBenchReward.md @@ -0,0 +1,55 @@ +# OlympiadBench Reward + +A family of reward functions for evaluating OlympiadBench math and physics competition problems. + +## OlympiadBenchAccuracyReward + +Evaluates answer correctness with support for LaTeX normalization, numeric tolerance, and partial matching. + +```python +from twinkle.reward import OlympiadBenchAccuracyReward + +reward_fn = OlympiadBenchAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 for correct, 0.0 for incorrect +``` + +The reward function: +1. Extracts boxed answers from `\boxed{...}` with nested brace handling +2. Normalizes both prediction and ground truth (LaTeX, units, fractions) +3. Compares via normalized string matching or numeric comparison with tolerance + +## OlympiadBenchFormatReward + +Validates the structural format of model outputs. + +```python +from twinkle.reward import OlympiadBenchFormatReward + +reward_fn = OlympiadBenchFormatReward() +rewards = reward_fn(trajectories, ground_truths) +# rewards: List[float], scores based on format quality +``` + +Scoring criteria: +- Presence of `\boxed{...}` answer +- Answer positioning (should appear near the end) +- Answer uniqueness and consistency + +## OlympiadBenchQualityReward + +A composite quality reward combining multiple aspects of response quality. + +```python +from twinkle.reward import OlympiadBenchQualityReward + +reward_fn = OlympiadBenchQualityReward() +rewards = reward_fn(trajectories, ground_truths) +``` + +Quality dimensions: +- **Reasoning structure**: Detection of step-by-step reasoning patterns +- **Length appropriateness**: Smooth penalty curve for responses that are too short or too long +- **Content uniqueness**: Penalizes repetitive content within the response + +> These rewards can be used individually or combined as a composite reward for GRPO training on olympiad-level math and physics problems. diff --git a/docs/source_en/Components/Reward/index.rst b/docs/source_en/Components/Reward/index.rst index 401d9c89..20e93d62 100644 --- a/docs/source_en/Components/Reward/index.rst +++ b/docs/source_en/Components/Reward/index.rst @@ -4,3 +4,6 @@ Reward :maxdepth: 1 Reward.md + GSM8KReward.md + MultiModalReward.md + OlympiadBenchReward.md diff --git a/docs/source_en/Components/Task Processor/GRPOProcessor.md b/docs/source_en/Components/Task Processor/GRPOProcessor.md new file mode 100644 index 00000000..adff73c4 --- /dev/null +++ b/docs/source_en/Components/Task Processor/GRPOProcessor.md @@ -0,0 +1,19 @@ +# GRPOLossProcessor + +GRPOLossProcessor is a task processor wrapper designed for GRPO reinforcement learning training. 
It extends InputProcessor with GRPO-specific data preparation. + +```python +from twinkle.processor import GRPOLossProcessor + +processor = GRPOLossProcessor( + device_mesh=..., + padding_free=False, + framework='transformers', +) + +model.set_processor(processor) +``` + +GRPOLossProcessor wraps the base `InputProcessor` and adds handling for GRPO-specific fields such as advantages, old log-probabilities, and reference log-probabilities that are required by the GRPO loss function. + +> For standard SFT tasks, use `InputProcessor` directly. Use `GRPOLossProcessor` when your training loop involves GRPO or its variants. diff --git a/docs/source_en/Components/Task Processor/index.rst b/docs/source_en/Components/Task Processor/index.rst index 1e9d600a..1f20fdbc 100644 --- a/docs/source_en/Components/Task Processor/index.rst +++ b/docs/source_en/Components/Task Processor/index.rst @@ -4,3 +4,4 @@ Task Processor :maxdepth: 1 InputProcessor.md + GRPOProcessor.md diff --git a/docs/source_en/index.rst b/docs/source_en/index.rst index f35db745..ef477f7f 100644 --- a/docs/source_en/index.rst +++ b/docs/source_en/index.rst @@ -30,9 +30,12 @@ Twinkle DOCUMENTATION Components/Sampler/index.rst Components/Reward/index.rst Components/Advantage/index.rst + Components/Gym/index.rst + Components/Hub/index.rst Components/Checkpoint Engine/index.rst Components/Metrics/index.rst Components/Loss/index.rst + Components/Loss Scale/index.rst Components/LRScheduler/index.rst Components/Patch/index.rst Components/Plugin/index.rst diff --git a/docs/source_zh/index.rst b/docs/source_zh/index.rst index 175beb7b..3d07d4b2 100644 --- a/docs/source_zh/index.rst +++ b/docs/source_zh/index.rst @@ -30,9 +30,12 @@ Twinkle DOCUMENTATION 组件/采样器/index.rst 组件/奖励/index.rst 组件/优势/index.rst + 组件/Gym/index.rst + 组件/Hub/index.rst 组件/检查点引擎/index.rst 组件/指标/index.rst 组件/损失/index.rst + 组件/损失缩放/index.rst 组件/LRScheduler/index.rst 组件/补丁/index.rst 组件/组件化/index.rst diff --git "a/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md" "b/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md" new file mode 100644 index 00000000..63dc87aa --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/Gym/Gym.md" @@ -0,0 +1,26 @@ +# Gym + +Gym 组件为 Twinkle 中的强化学习环境提供接口。 + +```python +from twinkle.gym import Gym + +class CustomGym(Gym): + + def step(self, trajectories, **kwargs): + """ + 执行一个 RL 步骤:评估轨迹并返回奖励。 + + Args: + trajectories: 模型生成的待评估轨迹 + **kwargs: 额外参数 + + Returns: + 每个轨迹的奖励值 + """ + ... +``` + +Gym 抽象允许你插入自定义 RL 环境与训练循环交互。它将奖励计算和环境交互与核心训练逻辑解耦。 + +> Gym 通常用于在线策略 RL 训练中,环境需要对模型生成的输出提供反馈。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/Gym/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/Gym/index.rst" new file mode 100644 index 00000000..85d941b9 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/Gym/index.rst" @@ -0,0 +1,6 @@ +Gym +=============== +.. 
toctree:: + :maxdepth: 1 + + Gym.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/Hub/Hub.md" "b/docs/source_zh/\347\273\204\344\273\266/Hub/Hub.md" new file mode 100644 index 00000000..1d0c8b55 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/Hub/Hub.md" @@ -0,0 +1,20 @@ +# Hub + +Hub 组件提供对模型和数据集仓库的统一访问,同时支持魔搭和 Hugging Face。 + +```python +from twinkle.hub import Hub + +# 从魔搭下载 +Hub.download('ms://Qwen/Qwen3.5-4B', local_dir='./models') + +# 从 Hugging Face 下载 +Hub.download('hf://Qwen/Qwen3.5-4B', local_dir='./models') + +# 上传检查点 +Hub.upload(local_path='./my-model', repo_id='my-org/my-model', hub='ms') +``` + +`ms://` 和 `hf://` 前缀决定使用哪个仓库。Hub 自动处理认证、缓存和进度跟踪。 + +> Hub 被 Dataset、Model 和其他组件内部使用。你也可以直接使用它进行自定义的下载/上传工作流。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/Hub/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/Hub/index.rst" new file mode 100644 index 00000000..3d6140e2 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/Hub/index.rst" @@ -0,0 +1,6 @@ +Hub +=============== +.. toctree:: + :maxdepth: 1 + + Hub.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md" "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md" new file mode 100644 index 00000000..afb8f094 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/GRPOProcessor.md" @@ -0,0 +1,19 @@ +# GRPOLossProcessor + +GRPOLossProcessor 是专为 GRPO 强化学习训练设计的任务处理器包装器。它在 InputProcessor 基础上扩展了 GRPO 特有的数据准备功能。 + +```python +from twinkle.processor import GRPOLossProcessor + +processor = GRPOLossProcessor( + device_mesh=..., + padding_free=False, + framework='transformers', +) + +model.set_processor(processor) +``` + +GRPOLossProcessor 包装了基础 `InputProcessor`,并添加了 GRPO 特有字段的处理,如优势值、旧对数概率和参考对数概率,这些是 GRPO 损失函数所需要的。 + +> 对于标准 SFT 任务,直接使用 `InputProcessor`。当训练循环涉及 GRPO 或其变体时,使用 `GRPOLossProcessor`。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst" index a2c88eaf..1eb839f0 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst" +++ "b/docs/source_zh/\347\273\204\344\273\266/\344\273\273\345\212\241\345\244\204\347\220\206\345\231\250/index.rst" @@ -4,3 +4,4 @@ :maxdepth: 1 InputProcessor.md + GRPOProcessor.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/GSM8KReward.md" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/GSM8KReward.md" new file mode 100644 index 00000000..cc5b7d02 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/GSM8KReward.md" @@ -0,0 +1,34 @@ +# GSM8K 奖励 + +专为评估 GSM8K 数学问题求解设计的奖励函数。 + +## GSM8KAccuracyReward + +通过提取 boxed 或 hash 格式(`####`)的答案并进行数值/字符串比较来评估 GSM8K 答案的正确性。 + +```python +from twinkle.reward import GSM8KAccuracyReward + +reward_fn = GSM8KAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 表示正确, 0.0 表示错误 +``` + +奖励函数的工作流程: +1. 从模型补全中提取 `\boxed{...}` 或 `#### ...` 格式的答案 +2. 从参考轨迹中提取真实答案 +3. 
执行数值比较(带容差)或精确字符串匹配 + +## GSM8KFormatReward + +检查模型输出是否包含正确格式的答案部分。 + +```python +from twinkle.reward import GSM8KFormatReward + +reward_fn = GSM8KFormatReward() +rewards = reward_fn(trajectories, ground_truths) +# rewards: List[float], 1.0 表示格式有效, 0.0 表示无效 +``` + +> 在数学问题求解的 GRPO 训练中,将 GSM8KAccuracyReward 和 GSM8KFormatReward 组合使用作为复合奖励。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/MultiModalReward.md" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/MultiModalReward.md" new file mode 100644 index 00000000..ef09d460 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/MultiModalReward.md" @@ -0,0 +1,22 @@ +# 多模态奖励 + +用于评估多模态视觉问答(VQA)任务的奖励函数。 + +## MultiModalAccuracyReward + +评估多模态 VQA 答案的正确性,支持回退到符号数学验证。 + +```python +from twinkle.reward import MultiModalAccuracyReward + +reward_fn = MultiModalAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 表示正确, 0.0 表示错误 +``` + +奖励函数的工作流程: +1. 从补全文本中提取模型的答案 +2. 使用精确字符串匹配与真实答案比较 +3. 当字符串匹配失败时回退到 `math_verify` 进行符号表达式比较 + +> 专为 CLEVR 等视觉推理任务设计,答案可能是数字、布尔值或短文本。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/OlympiadBenchReward.md" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/OlympiadBenchReward.md" new file mode 100644 index 00000000..ac1d9bd1 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/OlympiadBenchReward.md" @@ -0,0 +1,55 @@ +# OlympiadBench 奖励 + +用于评估 OlympiadBench 数学和物理竞赛问题的奖励函数族。 + +## OlympiadBenchAccuracyReward + +评估答案正确性,支持 LaTeX 归一化、数值容差和部分匹配。 + +```python +from twinkle.reward import OlympiadBenchAccuracyReward + +reward_fn = OlympiadBenchAccuracyReward() +rewards = reward_fn(generated_trajectories, ground_truth_trajectories) +# rewards: List[float], 1.0 表示正确, 0.0 表示错误 +``` + +奖励函数的工作流程: +1. 从 `\boxed{...}` 中提取答案,支持嵌套大括号处理 +2. 归一化预测和真实答案(LaTeX、单位、分数) +3. 
通过归一化字符串匹配或带容差的数值比较进行判断 + +## OlympiadBenchFormatReward + +验证模型输出的结构格式。 + +```python +from twinkle.reward import OlympiadBenchFormatReward + +reward_fn = OlympiadBenchFormatReward() +rewards = reward_fn(trajectories, ground_truths) +# rewards: List[float], 基于格式质量的分数 +``` + +评分标准: +- `\boxed{...}` 答案的存在性 +- 答案位置(应出现在末尾附近) +- 答案的唯一性和一致性 + +## OlympiadBenchQualityReward + +结合多个维度评估响应质量的复合奖励。 + +```python +from twinkle.reward import OlympiadBenchQualityReward + +reward_fn = OlympiadBenchQualityReward() +rewards = reward_fn(trajectories, ground_truths) +``` + +质量维度: +- **推理结构**: 检测逐步推理模式 +- **长度适当性**: 对过短或过长响应的平滑惩罚曲线 +- **内容唯一性**: 惩罚响应中的重复内容 + +> 这些奖励可以单独使用或组合为复合奖励,用于竞赛级数学和物理问题的 GRPO 训练。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/index.rst" index 084262b2..d1e87993 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/index.rst" +++ "b/docs/source_zh/\347\273\204\344\273\266/\345\245\226\345\212\261/index.rst" @@ -4,3 +4,6 @@ :maxdepth: 1 Reward.md + GSM8KReward.md + MultiModalReward.md + OlympiadBenchReward.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/CompletionRewardMetric.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/CompletionRewardMetric.md" new file mode 100644 index 00000000..9f1c8258 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/CompletionRewardMetric.md" @@ -0,0 +1,27 @@ +# CompletionRewardMetric + +CompletionRewardMetric 在 RLHF 训练过程中聚合关键统计数据,包括生成时间、权重同步时间、奖励分数和补全长度。 + +```python +from twinkle.metric import CompletionRewardMetric + +metric = CompletionRewardMetric(device_mesh=..., process_group=...) + +# 在训练循环中累积 +metric.accumulate( + inputs, + outputs, + generation_time=gen_time, + weight_sync_time=sync_time, + rewards=reward_values, + completions=completion_texts, +) + +# 计算聚合指标 +result = metric.calculate() +# result 包含: generation_time, weight_sync_time, mean_reward, mean_completion_length 等 +``` + +此指标专为 GRPO 和其他 RL 训练循环设计,用于监控生成质量和系统性能。 + +> CompletionRewardMetric 执行 DP 感知的聚合,在所有数据并行 rank 上正确地取平均值。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/DPOMetric.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/DPOMetric.md" new file mode 100644 index 00000000..8e574ed4 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/DPOMetric.md" @@ -0,0 +1,27 @@ +# DPOMetric + +DPOMetric 在 DPO 训练过程中跟踪偏好优化相关的统计数据。 + +```python +from twinkle.metric import DPOMetric + +metric = DPOMetric(device_mesh=..., process_group=...) 
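+# 示意写法:device_mesh 与 process_group 需由训练侧预先构建,
+# 供 DPOMetric 在各数据并行 rank 之间做 DP 感知聚合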
+ +# 每次前向传播后累积 +metric.accumulate(inputs, outputs, ref_outputs=ref_outputs) + +# 计算聚合指标 +result = metric.calculate() +``` + +**跟踪的指标:** +- `chosen_logps`: chosen 响应的平均对数概率 +- `rejected_logps`: rejected 响应的平均对数概率 +- `ref_chosen_logps`: 参考模型对 chosen 响应的对数概率 +- `ref_rejected_logps`: 参考模型对 rejected 响应的对数概率 +- `rewards/chosen`: chosen 响应的隐式奖励 +- `rewards/rejected`: rejected 响应的隐式奖励 +- `accuracy`: chosen 优于 rejected 的样本对比例 +- `margin`: chosen 和 rejected 之间的平均奖励差距 + +> DPOMetric 在所有数据并行 rank 上执行 DP 感知的聚合。支持交替排列和分开排列的 chosen/rejected 批次格式。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst" index a8d9d6c5..6e03f97c 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\214\207\346\240\207/index.rst" @@ -6,4 +6,6 @@ TrainMetric.md LossMetric.md Accuracy.md + CompletionRewardMetric.md + DPOMetric.md 构建指标.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/ChunkedCrossEntropy.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/ChunkedCrossEntropy.md" new file mode 100644 index 00000000..b6c04f99 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/ChunkedCrossEntropy.md" @@ -0,0 +1,22 @@ +# 分块交叉熵 + +交叉熵损失的内存优化变体,通过在词表维度上分块处理来减少 GPU 峰值内存使用。 + +```python +from twinkle.loss import ChunkedCrossEntropyLoss + +loss_fn = ChunkedCrossEntropyLoss( + chunk_size=1024, # 词表分块大小 + reduction='mean', +) + +model.set_loss(loss_fn) +``` + +**参数:** +- `chunk_size`: 每块处理的词表 token 数量(默认: 1024) +- `reduction`: 归约模式 — `sum`, `mean`, 或 `none` + +实现使用自定义 autograd 函数,沿词表维度将 logit 到损失的计算分块进行。这避免了实例化完整的 `[batch*seq_len, vocab_size]` 概率张量,显著减少了大词表模型的内存占用。 + +> 当训练大词表模型时标准交叉熵导致 OOM 错误时非常有用。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/DPOLoss.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/DPOLoss.md" new file mode 100644 index 00000000..bd157125 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/DPOLoss.md" @@ -0,0 +1,70 @@ +# DPO 损失 + +直接偏好优化(DPO)及其变体用于在不需要单独奖励模型的情况下将模型与人类偏好对齐。 + +## DPOLoss + +标准 DPO 损失,支持多种损失类型和可选的无参考模式。 + +```python +from twinkle.loss import DPOLoss + +loss_fn = DPOLoss( + loss_type='sigmoid', # 'sigmoid', 'hinge', 'ipo', 'kto' + beta=0.1, + sft_weight=0.0, # 可选的 SFT 正则化权重 + reference_free=False, +) + +model.set_loss(loss_fn) +``` + +**参数:** +- `loss_type`: DPO 变体 — `sigmoid`(默认), `hinge`, `ipo`, 或 `kto` +- `beta`: 控制偏好强度的温度参数 +- `sft_weight`: chosen 响应上额外 SFT 损失的权重 +- `reference_free`: 为 `True` 时跳过参考模型的对数概率 + +损失函数期望批次中 chosen/rejected 样本交替排列。它计算序列级对数概率,优化策略使其偏好 chosen 而非 rejected 响应。 + +## SimPOLoss + +简化偏好优化,通过使用长度归一化的对数概率来消除对参考模型的需求。 + +```python +from twinkle.loss import SimPOLoss + +loss_fn = SimPOLoss(beta=2.0, gamma=1.0) +``` + +**参数:** +- `beta`: logit 差异的缩放因子 +- `gamma`: 添加到偏好差距的 margin 项 + +## CPOLoss + +对比偏好优化,将偏好学习与行为克隆相结合。 + +```python +from twinkle.loss import CPOLoss + +loss_fn = CPOLoss(beta=0.1, cpo_alpha=1.0) +``` + +**参数:** +- `beta`: 偏好损失的温度 +- `cpo_alpha`: chosen 响应上行为克隆(NLL)损失的权重 + +## ORPOLoss + +赔率比偏好优化,在单一损失中统一 SFT 和偏好对齐。 + +```python +from twinkle.loss import ORPOLoss + +loss_fn = ORPOLoss(beta=0.1) +``` + +该损失将 chosen 响应上的标准 NLL 项与对数赔率比惩罚相结合,推动模型远离 rejected 响应。 + +> 所有偏好损失都继承自 `PreferenceLossBase` 的共享工具方法,包括对数概率计算、chosen/rejected 拆分和序列级聚合。 diff --git 
"a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GKDLoss.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GKDLoss.md" new file mode 100644 index 00000000..06240b80 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GKDLoss.md" @@ -0,0 +1,27 @@ +# GKD 损失 + +广义知识蒸馏(GKD)损失使用 Jensen-Shannon 散度将知识从教师模型蒸馏到学生模型。 + +```python +from twinkle.loss import GKDLoss + +loss_fn = GKDLoss( + teacher_mode='full', # 'full', 'topk_local', 'topk_remote' + beta=0.5, # JSD 的插值权重 + temperature=1.0, +) + +model.set_loss(loss_fn) +``` + +**参数:** +- `teacher_mode`: 获取教师 logits 的方式 + - `full`: 来自本地教师模型的全词表 logits + - `topk_local`: 来自本地教师的 top-k logits,使用分块计算以节省内存 + - `topk_remote`: 来自远程 API 教师的 top-k logits +- `beta`: 学生和教师分布在 JSD 中的插值权重(0 = 纯学生,1 = 纯教师) +- `temperature`: 学生和教师分布的 softmax 温度 + +GKD 损失内部实现了分块计算,以减少处理大词表时的峰值内存使用。 + +> GKD 适用于训练模仿大型教师模型行为的小型学生模型,同时支持本地和远程教师设置。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GRPOLoss.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GRPOLoss.md" new file mode 100644 index 00000000..0908d233 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/GRPOLoss.md" @@ -0,0 +1,75 @@ +# GRPO 损失 + +组相对策略优化(GRPO)及其变体实现了带有 PPO 风格裁剪和 KL 正则化的策略梯度损失。 + +## GRPOLoss + +标准 GRPO 损失,带有重要性采样、PPO 裁剪和可选的 KL 惩罚。 + +```python +from twinkle.loss import GRPOLoss + +loss_fn = GRPOLoss( + clip_range=0.2, + beta=0.01, # KL 惩罚系数 +) + +model.set_loss(loss_fn) +``` + +**参数:** +- `clip_range`: 重要性权重的 PPO 裁剪范围(默认: 0.2) +- `beta`: KL 散度惩罚系数。设为 0 以禁用 KL 正则化 + +损失函数同时处理标准批次和打包序列(通过 `position_ids` 检测)。它计算每个 token 的重要性权重,应用 PPO 裁剪,并可选地添加针对参考策略的 KL 惩罚项。 + +## 变体 + +Twinkle 提供了多种 GRPO 变体: + +### GSPOLoss + +序列级重要性采样变体,在序列级别而非 token 级别计算重要性权重。 + +```python +from twinkle.loss import GSPOLoss +loss_fn = GSPOLoss(clip_range=0.2, beta=0.01) +``` + +### SAPOLoss + +软门控优势策略优化,在优势值上应用 sigmoid 门控来控制优化方向。 + +```python +from twinkle.loss import SAPOLoss +loss_fn = SAPOLoss(clip_range=0.2, beta=0.01, tau=1.0) +``` + +### CISPOLoss + +裁剪重要性采样策略优化,在与优势值相乘之前对重要性权重进行显式裁剪。 + +```python +from twinkle.loss import CISPOLoss +loss_fn = CISPOLoss(clip_range=0.2, beta=0.01) +``` + +### BNPOLoss + +批归一化策略优化,在聚合之前对批次内的每 token 损失进行归一化。 + +```python +from twinkle.loss import BNPOLoss +loss_fn = BNPOLoss(clip_range=0.2, beta=0.01) +``` + +### DRGRPOLoss + +动态比率 GRPO,使用固定分母进行重要性权重计算。 + +```python +from twinkle.loss import DRGRPOLoss +loss_fn = DRGRPOLoss(clip_range=0.2, beta=0.01) +``` + +> 所有 GRPO 变体共享相同的打包序列处理、对数概率对齐和 KL 惩罚计算基础流水线。它们的主要区别在于重要性权重和优势值的组合方式。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/MSELoss.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/MSELoss.md" new file mode 100644 index 00000000..e28d785a --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/MSELoss.md" @@ -0,0 +1,12 @@ +# MSE 损失 + +均方误差损失,用于回归式训练任务。 + +```python +from twinkle.loss import MSELoss + +loss_fn = MSELoss() +model.set_loss(loss_fn) +``` + +MSELoss 计算模型输出 logits 与目标 labels 之间的均方误差。适用于奖励模型训练或价值函数估计等任务。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst" index 2696d072..ea813f56 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261/index.rst" @@ -4,4 +4,9 @@ :maxdepth: 1 CrossEntropy.md + 
ChunkedCrossEntropy.md + DPOLoss.md + GKDLoss.md + GRPOLoss.md + MSELoss.md 构建损失.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/LossScale.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/LossScale.md" new file mode 100644 index 00000000..8260ad27 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/LossScale.md" @@ -0,0 +1,16 @@ +# 损失缩放 + +LossScale 组件控制训练过程中的损失缩放,确保数值稳定性,在混合精度训练中尤为重要。 + +```python +from twinkle.loss_scale import LossScale + +loss_scale = LossScale() + +# 在反向传播前对损失进行缩放 +scaled_loss = loss_scale(loss, num_tokens) +``` + +LossScale 通过有效 token 数量对损失值进行归一化,确保不同批次大小和序列长度下梯度幅度的一致性。 + +> LossScale 在模型训练流水线中内部使用。使用 `model.forward_backward()` 时会自动应用。 diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/index.rst" new file mode 100644 index 00000000..a997ef06 --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\215\237\345\244\261\347\274\251\346\224\276/index.rst" @@ -0,0 +1,6 @@ +损失缩放 +=============== +.. toctree:: + :maxdepth: 1 + + LossScale.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/index.rst" "b/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/index.rst" index 4842d98c..b2da04c1 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/index.rst" +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/index.rst" @@ -4,4 +4,5 @@ :maxdepth: 1 Preprocessor.md + 内置预处理器.md Filter.md diff --git "a/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/\345\206\205\347\275\256\351\242\204\345\244\204\347\220\206\345\231\250.md" "b/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/\345\206\205\347\275\256\351\242\204\345\244\204\347\220\206\345\231\250.md" new file mode 100644 index 00000000..9222406c --- /dev/null +++ "b/docs/source_zh/\347\273\204\344\273\266/\351\242\204\345\244\204\347\220\206\345\231\250\345\222\214\350\277\207\346\273\244\345\231\250/\345\206\205\347\275\256\351\242\204\345\244\204\347\220\206\345\231\250.md" @@ -0,0 +1,92 @@ +# 内置预处理器 + +Twinkle 提供了一系列内置预处理器,用于常见的数据集格式。每个预处理器将原始数据转换为标准化的 `Trajectory` 对象。 + +## LLM 预处理器 + +### CompetitionMathProcessor + +转换包含 `problem` 和 `solution` 字段的竞赛数学数据集。 + +```python +dataset.map('CompetitionMathProcessor') +# 输入: {'problem': '...', 'solution': '...'} +# 输出: 包含用户消息(问题)和助手消息(解答)的 Trajectory +``` + +### CompetitionMathGRPOProcessor + +类似于 CompetitionMathProcessor,但将解答存储在 `user_data` 中,用于 GRPO 奖励计算的真实答案。 + +```python +dataset.map('CompetitionMathGRPOProcessor') +``` + +### SelfCognitionProcessor + +将模板占位符替换为模型身份信息,用于自我认知训练。 + +```python +dataset.map('SelfCognitionProcessor', model_name='MyModel', model_author='MyOrg') +``` + +### AlpacaProcessor + +转换 Alpaca 格式数据集,包含 `instruction`、`input` 和 `output` 字段。 + +```python +dataset.map('AlpacaProcessor') +# 输入: 
{'instruction': '...', 'input': '...', 'output': '...'} +``` + +### CountdownProcessor + +生成倒计时算术问题,用于推理训练。 + +```python +dataset.map('CountdownProcessor') +``` + +### GSM8KProcessor + +预处理 GSM8K 数学数据集,从 `#### answer` 格式中提取真实答案。 + +```python +dataset.map('GSM8KProcessor') +# 从 '#### 42' 格式提取答案并存储在 user_data 中 +``` + +## DPO 预处理器 + +### EmojiDPOProcessor + +将基于 emoji 的偏好数据集转换为 DPO 训练所需的正/负轨迹对。 + +```python +dataset.map('EmojiDPOProcessor') +# 输入: {'prompt': '...', 'chosen': '...', 'rejected': '...'} +# 输出: 交替排列的 chosen 和 rejected Trajectory 对 +``` + +## 多模态预处理器 + +### CLEVRProcessor + +预处理 CLEVR 视觉推理数据集,支持图像处理。 + +```python +dataset.map('CLEVRProcessor') +# 输入: {'question': '...', 'answer': '...', 'image': PIL.Image} +# 输出: 包含多模态内容(图像 + 文本)的 Trajectory +``` + +### OlympiadBenchProcessor + +预处理 OlympiadBench 多模态数学/物理问题,支持图像收集和元数据存储。 + +```python +dataset.map('OlympiadBenchProcessor') +# 处理每个问题的多张图像,将真实答案和元数据存储在 user_data 中 +``` + +> 所有预处理器遵循相同的接口: `__call__(rows) -> List[Trajectory]`。你可以按照相同的模式注册自定义预处理器(参见 [预处理器](Preprocessor.md))。 From ecd6545b8aef1027df07bc1ca3d8a6c27cee6774 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 20:51:33 +0800 Subject: [PATCH 4/8] fix --- README.md | 47 +++++++++++-------- README_ZH.md | 41 ++++++++++------ docs/source_en/Usage Guide/Installation.md | 30 ++++++++++++ docs/source_en/Usage Guide/Quick-Start.md | 22 +++++---- .../Usage Guide/Train-as-a-Service.md | 10 ++-- .../\345\256\211\350\243\205.md" | 30 ++++++++++++ ...53\351\200\237\345\274\200\345\247\213.md" | 40 +++++++++------- ...55\347\273\203\346\234\215\345\212\241.md" | 10 ++-- 8 files changed, 159 insertions(+), 71 deletions(-) diff --git a/README.md b/README.md index 4713f442..42af8684 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@

-

+

by ModelScope
@@ -30,8 +30,8 @@ with `torchrun`, or scaling training across Ray clusters, Twinkle✨ eliminates infrastructure friction by encapsulating training logic into standardized APIs. Beyond simple abstraction, Twinkle✨ serves as a robust backend and gateway to enable serverless Training-as-a-Service (TaaS). -It offers interfaces that constitute a _superset_ of [Tinker](https://thinkingmachines.ai/tinker/) APIs, -thereby making it possible to access a Twinkle✨ training service via Tinker client or native Twinkle✨ client +It offers interfaces that constitute a _superset_ of [Tinker](https://thinkingmachines.ai/tinker/) APIs, +thereby making it possible to access a Twinkle✨ training service via Tinker client or the native Twinkle✨ client, which offers more functionalities. 🧩 Decoupled Architecture: Standardized Interfaces, backward compatible with Tinker APIs.
@@ -39,7 +39,7 @@ which offers more functionalities.
🔌 Versatile Backends: Transformers / Megatron.
👥 Multi-Tenancy Training Service: Train multiple LoRAs that share one base model deployment.
-Note: Twinkle✨is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and +Note: Twinkle✨ is built by the team behind [ms-swift](https://github.com/modelscope/ms-swift), and we expect the two projects to evolve together. We expect some fundamental components in Twinkle✨will likely be reused in [ms-swift](https://github.com/modelscope/ms-swift). @@ -89,25 +89,34 @@ sh INSTALL_MEGATRON.sh ## Tutorials -| Training Type | Model Framework | Cookbook Path | -| --------------------------------- | --------------- | ------------------------------------------------- | -| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) | -| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) | -| ep FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) | -| sp FSDP finetuning | transformers | [Script](cookbook/transformers/sp_fsdp_dense.py) | -| EP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) | -| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) | -| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) | -| tinker client finetuning | megatron | [Script](cookbook/client/tinker/megatron) | -| tinker client finetuning/sampling | transformers | [Script](cookbook/client/tinker/transformer) | -| twinkle client finetuning | megatron | [Script](cookbook/client/twinkle/megatron) | -| twinkle client finetuning | transformer | [Script](cookbook/client/twinkle/transformer) | +| Training Type | Model Framework | Cookbook Path | +| ------------------------------------ | --------------- | ----------------------------------------------------- | +| FSDP finetuning | transformers | [Script](cookbook/transformers/fsdp2.py) | +| FSDP MoE finetuning | transformers | [Script](cookbook/transformers/fsdp2_moe.py) | +| EP FSDP MoE finetuning | transformers | [Script](cookbook/transformers/ep_fsdp_qwen3_moe.py) | +| SP FSDP finetuning | transformers | [Script](cookbook/transformers/sp_fsdp_dense.py) | +| pp/tp/cp finetuning | megatron | [Script](cookbook/megatron/tp.py) | +| pp/tp/cp MoE finetuning | megatron | [Script](cookbook/megatron/tp_moe.py) | +| Multimodal FSDP finetuning | transformers | [Script](cookbook/mm/fsdp2.py) | +| GRPO RL training | megatron | [Script](cookbook/rl/grpo.py) | +| GRPO Multimodal RL training | megatron | [Script](cookbook/rl/grpo_mm.py) | +| GRPO Math RL training | megatron | [Script](cookbook/rl/short_math_grpo.py) | +| DPO full-parameter training | transformers | [Script](cookbook/rl/dpo_full.py) | +| DPO LoRA training | transformers | [Script](cookbook/rl/dpo_lora.py) | +| DPO multi-LoRA training | transformers | [Script](cookbook/rl/dpo_multi_lora.py) | +| GKD on-policy distillation | megatron | [Script](cookbook/rl/gkd_on_policy.py) | +| GKD off-policy distillation | megatron | [Script](cookbook/rl/gkd_off_policy.py) | +| Tinker client finetuning (self-host) | transformers | [Script](cookbook/client/tinker/self_host) | +| Tinker client finetuning (ModelScope) | transformers | [Script](cookbook/client/tinker/modelscope) | +| Twinkle client finetuning (self-host) | transformers | [Script](cookbook/client/twinkle/self_host) | +| Twinkle client finetuning (ModelScope) | transformers | [Script](cookbook/client/twinkle/modelscope) | +| Server startup scripts | transformers/megatron | [Script](cookbook/client/server) | ## Changelog - 🎉2026-04-14 The ModelScope service has been deployed to 
[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0. - 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py). - 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/ -- 🎉2026-03-19 Support GKD training ,please refer to this [cookbook](cookbook/rl/gkd_on_policy.py). +- 🎉2026-03-19 Support GKD training, please refer to this [cookbook](cookbook/rl/gkd_on_policy.py). - 🎉2026-02-13 Initial version of Twinkle✨ released, including SFT/PT/RL support for text models. ## Training as a Service on ModelScope @@ -122,7 +131,7 @@ our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md). | Hardware Environment | Notes | | -------------------- | ---------------------------------------------------------------- | | Nvidia GPUs | ✅ Support for BF16/Flash-Attn may be incomplete in earlier GPUs | -| Ascend NPU | ✅ Some operators may not supported | +| Ascend NPU | ✅ Some operators may not be supported | | PPU | ✅ | | CPU | Supports partial components like dataset, dataloader | diff --git a/README_ZH.md b/README_ZH.md index a6a2662f..5fd89b6b 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -1,8 +1,8 @@ -# Twinkle: Training workbench to make your model glow +

Twinkle: Training workbench to make your model glow

-

+

ModelScope
@@ -71,7 +71,7 @@ Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser 这个脚本会下载或利用conda,创建一个叫`twinkle-client`的虚拟环境,这个环境可以直接用于远端训练。 -如果你需要安装Megatron相关依赖,可以如下脚本: +如果你需要安装Megatron相关依赖,可以使用如下脚本: ```shell sh INSTALL_MEGATRON.sh @@ -79,17 +79,28 @@ sh INSTALL_MEGATRON.sh ## 教程 -| 训练类型 | 模型框架 | Cookbook 路径 | -| ---------------------------- | -------- | ------------------------------------------------- | -| FSDP 微调 | transformers | [脚本](cookbook/transformers/fsdp2.py) | -| FSDP MoE 微调 | transformers | [脚本](cookbook/transformers/fsdp2_moe.py) | -| EP MoE 微调 | transformers | [脚本](cookbook/transformers/ep_fsdp_qwen3_moe.py) | -| pp/tp/cp 微调 | megatron | [脚本](cookbook/megatron/tp.py) | -| pp/tp/cp MoE 微调 | megatron | [脚本](cookbook/megatron/tp_moe.py) | -| tinker 客户端微调 | megatron | [脚本](cookbook/client/tinker/megatron) | -| tinker 客户端微调/采样 | transformers | [脚本](cookbook/client/tinker/transformer) | -| twinkle 客户端微调 | megatron | [脚本](cookbook/client/twinkle/megatron) | -| twinkle 客户端微调 | transformer | [脚本](cookbook/client/twinkle/transformer) | +| 训练类型 | 模型框架 | Cookbook 路径 | +| ----------------------------------- | --------------------- | ----------------------------------------------------- | +| FSDP 微调 | transformers | [脚本](cookbook/transformers/fsdp2.py) | +| FSDP MoE 微调 | transformers | [脚本](cookbook/transformers/fsdp2_moe.py) | +| EP FSDP MoE 微调 | transformers | [脚本](cookbook/transformers/ep_fsdp_qwen3_moe.py) | +| SP FSDP 微调 | transformers | [脚本](cookbook/transformers/sp_fsdp_dense.py) | +| pp/tp/cp 微调 | megatron | [脚本](cookbook/megatron/tp.py) | +| pp/tp/cp MoE 微调 | megatron | [脚本](cookbook/megatron/tp_moe.py) | +| 多模态 FSDP 微调 | transformers | [脚本](cookbook/mm/fsdp2.py) | +| GRPO 强化学习训练 | megatron | [脚本](cookbook/rl/grpo.py) | +| GRPO 多模态强化学习训练 | megatron | [脚本](cookbook/rl/grpo_mm.py) | +| GRPO 数学强化学习训练 | megatron | [脚本](cookbook/rl/short_math_grpo.py) | +| DPO 全参数训练 | transformers | [脚本](cookbook/rl/dpo_full.py) | +| DPO LoRA 训练 | transformers | [脚本](cookbook/rl/dpo_lora.py) | +| DPO 多 LoRA 训练 | transformers | [脚本](cookbook/rl/dpo_multi_lora.py) | +| GKD 在线蒸馏 | megatron | [脚本](cookbook/rl/gkd_on_policy.py) | +| GKD 离线蒸馏 | megatron | [脚本](cookbook/rl/gkd_off_policy.py) | +| Tinker 客户端微调(自部署) | transformers | [脚本](cookbook/client/tinker/self_host) | +| Tinker 客户端微调(ModelScope) | transformers | [脚本](cookbook/client/tinker/modelscope) | +| Twinkle 客户端微调(自部署) | transformers | [脚本](cookbook/client/twinkle/self_host) | +| Twinkle 客户端微调(ModelScope) | transformers | [脚本](cookbook/client/twinkle/modelscope) | +| 服务端启动脚本 | transformers/megatron | [脚本](cookbook/client/server) | Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的,非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md) @@ -97,7 +108,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 🎉2026-04-16 ModelScope的训练服务部署为[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B),并发布了0.2.0版本. 
🎉2026-03-28 支持 DPO 训练,同时支持 Transformers 和 Megatron 后端。参考 [dpo_full.py](cookbook/rl/dpo_full.py) 和 [dpo_lora.py](cookbook/rl/dpo_lora.py)。 🎉2026-03-24 Twinkle 站点上线,访问地址 https://modelscope.github.io/twinkle-web/ -🎉2026-03-19 支持GKD蒸馏能力,参考[cookbook](cookbook/rl/gkd_on_policy.py)。 +🎉2026-03-19 支持 GKD 蒸馏能力,参考 [cookbook](cookbook/rl/gkd_on_policy.py)。 🎉2026-02-13 Twinkle✨ 初始版本发布,支持文本模型的SFT/PT/RL训练。我们还通过兼容Tinker的API,在魔搭社区上提供了无服务器训练功能。 ## ModelScope 的训练服务 diff --git a/docs/source_en/Usage Guide/Installation.md b/docs/source_en/Usage Guide/Installation.md index 3cec8ded..75b8118a 100644 --- a/docs/source_en/Usage Guide/Installation.md +++ b/docs/source_en/Usage Guide/Installation.md @@ -16,6 +16,36 @@ cd twinkle pip install -e . ``` +## Docker Image + +You can also use our pre-built Docker image: + +```text +modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:twinkle-0.2.0 +``` + +## Client Installation + +If you need to use Twinkle's Client for remote training, you can use our one-click installation script: + +```shell +# Mac or Linux +sh INSTALL_CLIENT.sh +# Windows, Open with PowerShell +Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +.\INSTALL_CLIENT.ps1 +``` + +This script will download or utilize conda to create a virtual environment called `twinkle-client`, which can be directly used for remote training. + +## Megatron Dependencies + +If you need to install Megatron-related dependencies, you can use the following script: + +```shell +sh INSTALL_MEGATRON.sh +``` + ## Supported Hardware | Hardware Environment | Notes | diff --git a/docs/source_en/Usage Guide/Quick-Start.md b/docs/source_en/Usage Guide/Quick-Start.md index 4ffa9c86..906f0db0 100644 --- a/docs/source_en/Usage Guide/Quick-Start.md +++ b/docs/source_en/Usage Guide/Quick-Start.md @@ -2,12 +2,12 @@ ## ✨ What is Twinkle? -A component library for large model training. Based on PyTorch, simpler, more flexible, production-ready. +A component library for large model training. Based on PyTorch, it is simpler, more flexible, and production-ready. 🧩 Loosely Coupled Architecture · Standardized Interfaces
🚀 Multiple Runtime Modes · torchrun / Ray / HTTP
🔌 Multi-Framework Compatible · Transformers / Megatron
-👥 Multi-Tenant Support · Single Base Model Deployment
+👥 Multi-Tenant Support · Single Base Model Deployment
## Twinkle Compatibility @@ -764,14 +764,18 @@ This service shares the same code as the Tinker API section described above. The Twinkle provides a sampling API that can be used to control the sampling process more flexibly for result validation, or to participate in the sampling workflow of RL algorithms. -## Using Hugging Face models +> For complete examples of all supported training modes, please refer to the [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook) directory. -Switch the prefix. +## Using Hugging Face Models + +To load models from Hugging Face instead of ModelScope, simply switch the prefix: ```text ms://Qwen/Qwen3.5-4B -> hf://Qwen/Qwen3.5-4B ``` +All components that accept a `model_id` parameter support this prefix-based routing. + ## 🛠️ Twinkle✨ Modular Ecosystem

@@ -849,7 +853,7 @@ ms://Qwen/Qwen3.5-4B -> hf://Qwen/Qwen3.5-4B ## Twinkle's Customizable Components -In Twinkle's design, training using torchrun, Ray, and HTTP uses the same API and shares the same components and input/output structures. Therefore, many of its components can be customized by developers to implement new algorithm development. +In Twinkle's design, training via torchrun, Ray, and HTTP uses the same API and shares the same components and input/output structures. Therefore, many of its components can be customized by developers to implement new algorithms. Below is a list of recommended components for customization: @@ -869,11 +873,11 @@ Below is a list of recommended components for customization: | Template | twinkle.template.Template | Used to process standard inputs and convert them to tokens required by the model | | Weight Synchronization | twinkle.checkpoint_engine.CheckpointEngine | Used for weight synchronization in RL training | -> Components not listed in the above table, such as Dataset, DataLoader, etc., can also be customized, just follow the base class API design. +> Components not listed in the above table, such as Dataset, DataLoader, etc., can also be customized; simply follow the base class API design. ## DeviceGroup and DeviceMesh -DeviceGroup and DeviceMesh are the core of Twinkle's architecture. All code construction is based on these two designs. +DeviceGroup and DeviceMesh are the core concepts of Twinkle's architecture. All code construction is based on these two designs. ```python import twinkle @@ -892,7 +896,7 @@ twinkle.initialize(mode='ray', nproc_per_node=8, groups=device_group) After defining the device_group, you need to use `twinkle.initialize` to initialize resources. -DeviceGroup: Define how many resource groups are needed for this training session. Once defined, components can run themselves remotely by selecting resource groups: +DeviceGroup: Defines how many resource groups are needed for this training session. Once defined, components can run themselves remotely by selecting a resource group: ```python from twinkle.model import TransformersModel @@ -902,7 +906,7 @@ from twinkle.model import MegatronModel model = MegatronModel(model_id='Qwen/Qwen3.5-4B', remote_group='default', device_mesh=device_mesh) ``` -DeviceMesh specifies the topology of components like models within the resource group. It can be understood as how to perform parallelization. This affects a series of framework decisions, such as data acquisition, data consumption, data return, etc. +DeviceMesh specifies the topology of components like models within the resource group. It can be understood as how to perform parallelization. This affects a series of framework decisions such as data acquisition, data consumption, and data return. 
## Usage Example diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index 7ed8db76..e244b9a6 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -31,7 +31,7 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum base_model = 'ms://Qwen/Qwen3.6-35B-A3B' -base_url='http://www.modelscope.cn/twinkle' +base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data @@ -80,7 +80,7 @@ init_tinker_client() from tinker import ServiceClient base_model = 'Qwen/Qwen3.6-35B-A3B' -base_url = 'http://www.modelscope.cn/twinkle' +base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server service_client = ServiceClient( @@ -121,7 +121,7 @@ params = types.SamplingParams( ) # Step 6: Send the sampling request to the server. -# num_samples=1 generates 1 independent completions for the same prompt. +# num_samples=1 generates 1 independent completion for the same prompt. print('Sampling...') future = sampling_client.sample(prompt=prompt, sampling_params=params, num_samples=1) result = future.result() @@ -134,9 +134,9 @@ for i, seq in enumerate(result.sequences): Developers can also merge this LoRA with the base model and then deploy it using their own service, calling it through the OpenAI-compatible standard API. -> The ModelScope server is tinker-compatible, so use the tinker cookbooks. In the future version, we will support a server works both for twinkle/tinker clients. +> The ModelScope server is currently Tinker-compatible, so please use the Tinker cookbooks. In a future version, we will support a server that works for both Twinkle and Tinker clients. -Developers can customize datasets, advantage functions, rewards, templates, and more. However, the Loss component is not currently customizable since it needs to be executed on the server side (for security reasons). If you need support for additional Loss functions, you can upload your Loss implementation to ModelHub and contact us via the Q&A group or through an issue to have the corresponding component added to the whitelist. +Developers can customize datasets, advantage functions, rewards, templates, and more. However, the Loss component is not currently customizable since it needs to be executed on the server side (for security reasons). If you need support for additional Loss functions, you can upload your Loss implementation to [ModelHub](https://modelscope.cn) and contact us via the Q&A group or through an [issue](https://github.com/modelscope/twinkle/issues) to have the corresponding component added to the whitelist. ## Appendix: Supported Training Methods diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\256\211\350\243\205.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\256\211\350\243\205.md" index c13a1022..4915d677 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\256\211\350\243\205.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\256\211\350\243\205.md" @@ -16,6 +16,36 @@ cd twinkle pip install -e . 
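+# 以可编辑模式(-e)安装,本地修改源码后无需重新安装即可生效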
``` +## Docker 镜像 + +你也可以使用我们的预构建 Docker 镜像: + +```text +modelscope-registry.cn-hangzhou.cr.aliyuncs.com/modelscope-repo/modelscope:twinkle-0.2.0 +``` + +## 客户端安装 + +如果你需要使用 Twinkle 的 Client 进行远程训练,可以使用我们的一键安装脚本: + +```shell +# Mac or Linux +sh INSTALL_CLIENT.sh +# Windows, Open with PowerShell +Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser +.\INSTALL_CLIENT.ps1 +``` + +这个脚本会下载或利用 conda,创建一个叫 `twinkle-client` 的虚拟环境,这个环境可以直接用于远端训练。 + +## Megatron 依赖安装 + +如果你需要安装 Megatron 相关依赖,可以使用如下脚本: + +```shell +sh INSTALL_MEGATRON.sh +``` + ## 支持的硬件 | 硬件环境 | 备注 | diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" index b8161c81..710022bb 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\345\277\253\351\200\237\345\274\200\345\247\213.md" @@ -7,7 +7,7 @@ 🧩 松耦合架构 · 标准化接口
🚀 多运行模式 · torchrun / Ray / HTTP
🔌 多框架兼容 · Transformers / Megatron
-👥 多租户支持 · 单基座模型部署
+👥 多租户支持 · 单基座模型部署
## Twinkle 适配性 @@ -96,7 +96,7 @@ Twinkle的所有组件都支持单独拆分使用,可以参考下面章节的 ### 单GPU -Twinkle支持单GPU运行训练。下面是一个例子: +Twinkle 支持单GPU运行训练。下面是一个例子: ```python from peft import LoraConfig @@ -159,7 +159,7 @@ if __name__ == '__main__': ### torchrun -Twinkle支持以torchrun模式运行训练。在这种场景下,不需要安装ray相关的依赖。 +Twinkle 支持以 torchrun 模式运行训练。在这种场景下,不需要安装 Ray 相关的依赖。 ```python from peft import LoraConfig @@ -236,7 +236,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun --nproc_per_node=8 train.py [Ray](https://github.com/ray-project/ray)是多机模型训练和推理场景中常用的调度中间件框架。它针对多模型、多设备的执行和资源管理进行了额外优化, 并支持对接kubernetes系统进行生产化。这样的特性使得它尤其适用于RL、GKD等复杂训练场景中。 -Twinkle支持使用ray进行训练和采样,并且它的代码和上面的训练API几乎一致: +Twinkle 支持使用 Ray 进行训练和采样,并且它的代码和上面的训练 API 几乎一致: ```python import os @@ -397,15 +397,15 @@ if __name__ == '__main__': ``` 在上面的代码中,我们给出了一个RL的训练代码。我们可以在代码中清晰看到数据如何构造、sampler/model如何声明和传参,以及advantage和loss的构造过程。 -这个过程没有任何显示引用`ray`的地方。我们仅在初始化时声明了ray模式: +这个过程没有任何显式引用 `ray` 的地方。我们仅在初始化时声明了 ray 模式: ```python twinkle.initialize(mode='ray', nproc_per_node=NUM_GPUS, groups=device_groups, lazy_collect=False) ``` -开发者可以定制模型等组件的构造和调用方式,所有transformers、Megatron的模型参数都可以在构造模型时传入。 +开发者可以定制模型等组件的构造和调用方式,所有 Transformers、Megatron 的模型参数都可以在构造模型时传入。 -后面所有的ray调用和数据分发,都是隐式进行的。运行这个脚本需要提前安装好ray。之后这样运行: +后面所有的 ray 调用和数据分发,都是隐式进行的。运行这个脚本需要提前安装好 Ray。之后这样运行: ```shell python train.py @@ -413,7 +413,7 @@ python train.py ### 远程训练 -Twinkle的一大特色是支持多租户用户混合训练。具体来说,多个用户可以使用一个基模进行lora训练,这样可以极大减小服务端部署成本。 +Twinkle 的一大特色是支持多租户用户混合训练。具体来说,多个用户可以使用一个基模进行 LoRA 训练,这样可以极大减小服务端部署成本。 假设我们使用八卡开启一个服务。首先我们需要启动ray集群: @@ -423,12 +423,12 @@ CUDA_VISIBLE_DEVICES=2,3 ray start --address=127.0.0.1:6379 --num-gpus=2 CUDA_VISIBLE_DEVICES="" ray start --address=127.0.0.1:6379 --num-gpus=0 ``` -我们启动了一组包含三个node的ray集群: -- 01两张卡作为一个node -- 23两张卡作为一个node -- cpu资源作为一个node +我们启动了一组包含三个 node 的 Ray 集群: +- 0、1 两张卡作为一个 node +- 2、3 两张卡作为一个 node +- CPU 资源作为一个 node -如果在生产环境使用,可以启动更多node,并部署更多replica以兼容更大的用户量。在这里我们仅以四卡作为例子。 +如果在生产环境使用,可以启动更多 node,并部署更多 replica 以兼容更大的用户量。在这里我们仅以四卡作为例子。 下面,启动server: ```shell @@ -437,7 +437,7 @@ cd cookbook/client/twinkle/transformer python server.py ``` -服务端会启动一个包含了一个sampler集群、一个模型集群、一个工具集群的三个服务。 +服务端会启动一个包含 Sampler 集群、模型集群、工具集群的三个服务。 下面可以进行client端训练: ```python @@ -692,7 +692,7 @@ if __name__ == '__main__': train() ``` -多个开发者可以并行使用这个服务的单个基模并行训练和采样。并且,他们进行的训练方式允许不同。例如,A用户可以进行SFT,B用户可以进行RL,C用户可以进行采样。 同样,Twinkle也支持Tinker-like API进行远端训练: +多个开发者可以并行使用这个服务的单个基模进行并行训练和采样。并且,他们进行的训练方式允许不同。例如,A 用户可以进行 SFT,B 用户可以进行 RL,C 用户可以进行采样。同样,Twinkle 也支持 Tinker-like API 进行远端训练: ```python from tinker import types @@ -764,16 +764,20 @@ if __name__ == '__main__': 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 该服务和上面叙述的Tinker API部分代码是相同的,唯一不同的是Endpoint和Token需要使用魔搭官方的对应信息。关于如何使用官方服务,请查看[训练服务](./训练服务.md)的详细描述。 -Twinkle提供了采样API,该API可以用于更灵活地控制采样方式以验证结果,或者参与到RL算法的采样流程中。 +Twinkle提供了采样 API,该 API 可以用于更灵活地控制采样方式以验证结果,或者参与到 RL 算法的采样流程中。 -## 使用Hugging Face的模型 +> 完整的训练模式示例请参考 [cookbook](https://github.com/modelscope/twinkle/tree/main/cookbook) 目录。 -切换前缀即可。 +## 使用 Hugging Face 的模型 + +要从 Hugging Face 加载模型而不是 ModelScope,只需切换前缀即可: ```text ms://Qwen/Qwen3.5-4B -> hf://Qwen/Qwen3.5-4B ``` +所有接受 `model_id` 参数的组件都支持这种基于前缀的路由。 + ## 🛠️ Twinkle✨ 模块化生态系统
diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 9ad901de..63a7425c 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -34,7 +34,7 @@ from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum base_model = 'ms://Qwen/Qwen3.6-35B-A3B' -base_url='http://www.modelscope.cn/twinkle' +base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') # Use twinkle dataset to load the data @@ -83,7 +83,7 @@ init_tinker_client() from tinker import ServiceClient base_model = 'Qwen/Qwen3.6-35B-A3B' -base_url = 'http://www.modelscope.cn/twinkle' +base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server service_client = ServiceClient( @@ -137,10 +137,10 @@ for i, seq in enumerate(result.sequences): 开发者也可以将这个lora和原模型合并之后,使用自己的服务进行部署,并使用OpenAI标准接口进行调用。 -> 目前的服务兼容tinker client,因此请使用tinker的cookbook进行训练。后续我们会支持单服务器支持twinkle/tinker双client。 +> 目前的服务兼容 Tinker Client,因此请使用 Tinker 的 cookbook 进行训练。后续我们会支持单服务器同时支持 Twinkle/Tinker 双客户端。 -开发者可以定制数据集/优势函数/奖励/模板等,其中 Loss 部分由于需要在服务端执行,因此当前暂不支持(安全性原因)。 -如果需要支持您的额外 Loss,可以将该 Loss 实现上传到 ModelHub 中,并在答疑群中或者 issue 中联系我们,将对应组件开放白名单即可使用。 +开发者可以定制数据集/优势函数/奖励/模板等,其中 Loss 部分由于需要在服务端执行,因此当前暂不支持定制(安全性原因)。 +如果需要支持您的额外 Loss,可以将该 Loss 实现上传到 [ModelHub](https://modelscope.cn) 中,并在答疑群中或者 [issue](https://github.com/modelscope/twinkle/issues) 中联系我们,将对应组件开放白名单即可使用。 ## 附录:支持的训练方式 From 40b7ae471c1de95482ed2277802f5258ce70d13f Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 21:01:21 +0800 Subject: [PATCH 5/8] fix --- README.md | 3 +-- README_ZH.md | 2 +- docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md | 2 +- docs/source_en/Usage Guide/Train-as-a-Service.md | 6 +++--- ....5\346\234\200\344\275\263\345\256\236\350\267\265.md" | 2 +- .../\350\256\255\347\273\203\346\234\215\345\212\241.md" | 8 +++----- 6 files changed, 10 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 42af8684..e6779d46 100644 --- a/README.md +++ b/README.md @@ -121,8 +121,7 @@ sh INSTALL_MEGATRON.sh ## Training as a Service on ModelScope -We are rolling out training service built atop Twinkle✨ on ModelScope. It is currently in _Beta_. You may -sign up for free access by joining the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, and +We are rolling out training service built atop Twinkle✨ on ModelScope. You may train via API endpoint `base_url=https://www.modelscope.cn/twinkle`. For more details, please refer to our [documentation](docs/source_en/Usage%20Guide/Train-as-a-Service.md). 
diff --git a/README_ZH.md b/README_ZH.md index 5fd89b6b..64ce16a6 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -113,7 +113,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl ## ModelScope 的训练服务 -我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。目前处于 _Beta_ 阶段。你可以通过加入 [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) 组织来注册免费访问,并通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](docs/source_zh/使用指引/训练服务.md)。 +我们正在 ModelScope 上推出基于 Twinkle✨ 构建的训练服务。你可以通过 API 端点 `base_url=https://www.modelscope.cn/twinkle` 进行训练。更多详情请参阅我们的[文档](docs/source_zh/使用指引/训练服务.md)。 ## 支持的硬件 diff --git a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md index c5856fdc..6f402823 100644 --- a/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md +++ b/docs/source_en/Usage Guide/Introduction-with-Qwen3.5.md @@ -524,7 +524,7 @@ Alongside the open-source release of Twinkle, ModelScope provides a hosted model **How to use:** -1. Register a ModelScope account and apply to join the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization +1. Register a ModelScope account at [modelscope.cn](https://www.modelscope.cn/) 2. Obtain your API Key on the [Token Management page](https://www.modelscope.cn/my/access/token) 3. Use the Tinker Client code above with the following endpoint: diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index e244b9a6..b3a3d736 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -4,13 +4,13 @@ Alongside the open-source release of the Twinkle framework, we also provide a ho The model currently running on the cluster is [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). Below are the detailed usage instructions: -## Step 1. Register a ModelScope Account and Apply to Join the twinkle-explorers Organization +## Step 1. Register a ModelScope Account and Obtain Your API Key -Developers first need to register as a ModelScope user and apply to join the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization to obtain access permissions. The current free Serverless training experience is still in beta testing and is only available to users within the organization. You can also use Twinkle✨ by deploying the service locally. +Developers first need to register as a ModelScope user. You can also use Twinkle✨ by deploying the service locally. Registration link: https://www.modelscope.cn/ -After registering and being approved to join the [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) organization, obtain your API-Key (i.e., the ModelScope platform access token) from this page: https://www.modelscope.cn/my/access/token. +After registering, obtain your API-Key (i.e., the ModelScope platform access token) from this page: https://www.modelscope.cn/my/access/token. 
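+
+One illustrative way to expose the key to the example code that follows (the snippets read it via `os.environ.get('MODELSCOPE_TOKEN')`; `your-api-key` is a placeholder):
+
+```shell
+export MODELSCOPE_TOKEN=your-api-key
+```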
API endpoint: `base_url="https://www.modelscope.cn/twinkle"` diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" index bd29a651..fd556108 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/Qwen3.5\346\234\200\344\275\263\345\256\236\350\267\265.md" @@ -524,7 +524,7 @@ Twinkle 框架开源的同时,魔搭社区依托自身算力基础设施,提 **使用方式:** -1. 注册魔搭账号并申请加入 [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) 组织 +1. 注册魔搭账号:[modelscope.cn](https://www.modelscope.cn/) 2. 在 [Token 管理页面](https://www.modelscope.cn/my/access/token) 获取 API Key 3. 使用上面的 Tinker Client 代码,修改 endpoint: diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 63a7425c..001ad5e7 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -5,15 +5,13 @@ 目前在集群中运行的模型是[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。下面介绍具体的使用方法: -## Step 1. 注册ModelScope用户并申请加入 twinkle-explorers 组织 +## Step 1. 注册ModelScope用户并获取 API Key -开发者首先需要注册成为ModelScope用户,并申请加入 [Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) 组织, -来获取访问权限。当前免费的Serverless训练体验,还在灰度测试中,暂时只向组织内的用户开放。您也可以通过本地部署服务,来使用Twinkle✨。 +开发者首先需要注册成为ModelScope用户。您也可以通过本地部署服务,来使用Twinkle✨。 注册地址:https://www.modelscope.cn/ -在注册并获批加入[Twinkle-Explorers](https://modelscope.cn/organization/twinkle-explorers) 组织后,在此页面获取 -访问的API-Key(即ModelScope平台的访问Token):https://www.modelscope.cn/my/access/token 。 +注册后,在此页面获取访问的API-Key(即ModelScope平台的访问Token):https://www.modelscope.cn/my/access/token 。 调用端点:`base_url="https://www.modelscope.cn/twinkle"` From beab1105a7f9552f235f012b235c34b039797f8c Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 21:15:38 +0800 Subject: [PATCH 6/8] fix --- cookbook/client/tinker/modelscope/sample.py | 2 +- .../tinker/modelscope/self_cognition.py | 2 +- cookbook/client/tinker/self_host/sample.py | 2 +- .../client/tinker/self_host/self_cognition.py | 2 +- .../Usage Guide/Train-as-a-Service.md | 2 +- ...55\347\273\203\346\234\215\345\212\241.md" | 2 +- .../dataset/iterable_packing_dataset.py | 4 +- src/twinkle/dataset/lazy_dataset.py | 4 +- src/twinkle/sampler/base.py | 4 +- .../sampler/vllm_sampler/vllm_sampler.py | 8 +- src/twinkle/template/base.py | 23 +++++- tests/template/test_chatglm.py | 6 +- tests/template/test_mm.py | 8 +- tests/template/test_template.py | 81 +++++++++---------- 14 files changed, 78 insertions(+), 72 deletions(-) diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py index ca3aa62b..d12f3bd6 100644 --- a/cookbook/client/tinker/modelscope/sample.py +++ b/cookbook/client/tinker/modelscope/sample.py @@ -45,7 +45,7 @@ ] ) -input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] +input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = 
input_feature['input_ids'].tolist() diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 7780df60..ac98be89 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -107,7 +107,7 @@ def eval(): ] ) - input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] + input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = input_feature['input_ids'].tolist() diff --git a/cookbook/client/tinker/self_host/sample.py b/cookbook/client/tinker/self_host/sample.py index 5a84c75b..1f4dfb27 100644 --- a/cookbook/client/tinker/self_host/sample.py +++ b/cookbook/client/tinker/self_host/sample.py @@ -43,7 +43,7 @@ ] ) -input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] +input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = input_feature['input_ids'].tolist() diff --git a/cookbook/client/tinker/self_host/self_cognition.py b/cookbook/client/tinker/self_host/self_cognition.py index 6d33b6c8..691662e6 100644 --- a/cookbook/client/tinker/self_host/self_cognition.py +++ b/cookbook/client/tinker/self_host/self_cognition.py @@ -109,7 +109,7 @@ def eval(): ] ) - input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] + input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = input_feature['input_ids'].tolist() diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index b3a3d736..2ea2d087 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -108,7 +108,7 @@ trajectory = Trajectory( ] ) -input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] +input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = input_feature['input_ids'].tolist() diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 001ad5e7..a6501e28 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -109,7 +109,7 @@ trajectory = Trajectory( ] ) -input_feature = template.batch_encode([trajectory], add_generation_prompt=True)[0] +input_feature = template.encode(trajectory, add_generation_prompt=True) input_ids = input_feature['input_ids'].tolist() diff --git a/src/twinkle/dataset/iterable_packing_dataset.py b/src/twinkle/dataset/iterable_packing_dataset.py index a5fea729..4c8ceb6f 100644 --- a/src/twinkle/dataset/iterable_packing_dataset.py +++ b/src/twinkle/dataset/iterable_packing_dataset.py @@ -63,8 +63,8 @@ def pack_dataset(self): def _processor(self): while True: i, data = self._in_queue.get() - encoded_data = self.template.batch_encode([data]) - data.update(encoded_data[0]) + encoded_data = self.template.encode(data) + data.update(encoded_data) self._out_queue.put((i, data)) def _put_data_in_queue(self, iterator) -> int: diff --git a/src/twinkle/dataset/lazy_dataset.py b/src/twinkle/dataset/lazy_dataset.py index 26447238..b1b9b9ad 100644 --- 
a/src/twinkle/dataset/lazy_dataset.py +++ b/src/twinkle/dataset/lazy_dataset.py @@ -177,9 +177,7 @@ def __getitem__(self, idx): # Lazy encode if self.do_encode: - encoded = self.template.batch_encode([item], - add_generation_prompt=self.add_generation_prompt, - **self.encode_kwargs)[0] + encoded = self.template.encode(item, add_generation_prompt=self.add_generation_prompt, **self.encode_kwargs) # Preserve extra fields not produced by encoding for key in item: if key not in encoded: diff --git a/src/twinkle/sampler/base.py b/src/twinkle/sampler/base.py index 47959b92..d8222ead 100644 --- a/src/twinkle/sampler/base.py +++ b/src/twinkle/sampler/base.py @@ -79,11 +79,11 @@ def encode_trajectory(self, if template is None: raise ValueError(f"Template not set for adapter '{adapter_name}'. Use set_template() first.") - encoded = template._encode(trajectory, add_generation_prompt=add_generation_prompt) + encoded = template.encode(trajectory, add_generation_prompt=add_generation_prompt) input_ids = encoded.get('input_ids') if input_ids is None: - raise ValueError("Template._encode() must return 'input_ids'") + raise ValueError("Template.encode() must return 'input_ids'") if hasattr(input_ids, 'tolist'): input_ids = input_ids.tolist() diff --git a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py index 70f9121e..64816cae 100644 --- a/src/twinkle/sampler/vllm_sampler/vllm_sampler.py +++ b/src/twinkle/sampler/vllm_sampler/vllm_sampler.py @@ -150,15 +150,15 @@ def encode_trajectory_for_vllm(self, """Encode trajectory for vLLM. Messages should already use transformers standard format (content is List[Dict]). - ``batch_encode`` preprocesses media refs in-place (to PIL objects). + ``encode`` preprocesses media refs in-place (to PIL objects). """ template = self.template if template is None: raise ValueError(f"Template not set for adapter '{adapter_name}'. Use set_template() first.") - encoded = template.batch_encode( - [trajectory], + encoded = template.encode( + trajectory, add_generation_prompt=add_generation_prompt, - )[0] + ) for key in encoded: if isinstance(encoded[key], np.ndarray): encoded[key] = encoded[key].tolist() diff --git a/src/twinkle/template/base.py b/src/twinkle/template/base.py index cb2419b1..10d0984f 100644 --- a/src/twinkle/template/base.py +++ b/src/twinkle/template/base.py @@ -548,8 +548,23 @@ def _encode_messages(self, trajectory: Trajectory, add_generation_prompt: bool = trajectory.update(input_feature) return trajectory - def _encode(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature: - return self._encode_messages(trajectory, add_generation_prompt, **kwargs) + def encode(self, trajectory: Trajectory, add_generation_prompt: bool = False, **kwargs) -> InputFeature: + """Encode a single trajectory into an InputFeature. + + This is a convenience wrapper around :meth:`batch_encode` for encoding + a single trajectory. + + Args: + trajectory: The trajectory to encode. + add_generation_prompt: Whether to add generation prompt. + + Returns: + The encoded InputFeature. + """ + assert self.truncation_strategy != 'split', ( + 'encode() does not support truncation_strategy=="split" because it may produce multiple outputs. 
' + 'Use batch_encode() instead.') + return self.batch_encode([trajectory], add_generation_prompt=add_generation_prompt, **kwargs)[0] @staticmethod def map_col_to_row(trajectories: Dict[str, Any]): @@ -645,7 +660,7 @@ def batch_encode( from concurrent.futures import ThreadPoolExecutor from functools import partial encode_fn = partial( - self._encode, + self._encode_messages, add_generation_prompt=add_generation_prompt, **kwargs, ) @@ -661,7 +676,7 @@ def batch_encode( def check(self, trajectory: Trajectory) -> Optional[Trajectory]: encoded = None try: - encoded = self.batch_encode([trajectory]) + encoded = self.encode(trajectory) if not encoded: return None else: diff --git a/tests/template/test_chatglm.py b/tests/template/test_chatglm.py index 2f1373ab..3db2690c 100644 --- a/tests/template/test_chatglm.py +++ b/tests/template/test_chatglm.py @@ -21,8 +21,8 @@ def test_nlp(self): ), ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) - self.assertTrue('input_ids' in encoded[0]) + encoded = template.encode(trajectory) + self.assertTrue('input_ids' in encoded) def test_mm(self): model_dir = HubOperation.download_model('ms://Qwen/Qwen3-VL-2B-Instruct') @@ -39,4 +39,4 @@ def test_mm(self): ), ] trajectory = Trajectory(messages=messages) - template.batch_encode([trajectory]) + template.encode(trajectory) diff --git a/tests/template/test_mm.py b/tests/template/test_mm.py index 6c222342..51b6d997 100644 --- a/tests/template/test_mm.py +++ b/tests/template/test_mm.py @@ -21,8 +21,8 @@ def test_nlp(self): ), ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) - self.assertTrue('input_ids' in encoded[0]) + encoded = template.encode(trajectory) + self.assertTrue('input_ids' in encoded) def test_mm(self): model_dir = HubOperation.download_model('ms://Qwen/Qwen3-VL-2B-Instruct') @@ -39,5 +39,5 @@ def test_mm(self): ), ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) - self.assertTrue('input_ids' in encoded[0]) + encoded = template.encode(trajectory) + self.assertTrue('input_ids' in encoded) diff --git a/tests/template/test_template.py b/tests/template/test_template.py index 41554d3a..a3095017 100644 --- a/tests/template/test_template.py +++ b/tests/template/test_template.py @@ -29,16 +29,15 @@ def test_qwen25_text_template_basic(self): ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] - assert len(encoded[0]['input_ids']) > 0 - assert len(encoded[0]['labels']) == len(encoded[0]['input_ids']) + assert 'input_ids' in encoded + assert 'labels' in encoded + assert len(encoded['input_ids']) > 0 + assert len(encoded['labels']) == len(encoded['input_ids']) - input_ids = encoded[0]['input_ids'] - labels = encoded[0]['labels'] + input_ids = encoded['input_ids'] + labels = encoded['labels'] assert isinstance(input_ids, np.ndarray) assert isinstance(labels, np.ndarray) @@ -61,12 +60,11 @@ def test_qwen25_text_template_multiple_messages(self): ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] - assert len(encoded[0]['input_ids']) > 0 + assert 'input_ids' in encoded + assert 'labels' in encoded + assert len(encoded['input_ids']) > 0 
@pytest.mark.skipif(SKIP_MODEL_DOWNLOAD, reason='Skipping tests that require model download') def test_qwen25_text_template_labels_correctness(self): @@ -78,10 +76,10 @@ def test_qwen25_text_template_labels_correctness(self): messages = [Message(role='user', content='Hello'), Message(role='assistant', content='Hi there')] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - input_ids = encoded[0]['input_ids'] - labels = encoded[0]['labels'] + input_ids = encoded['input_ids'] + labels = encoded['labels'] assert len(input_ids) == len(labels) @@ -113,16 +111,15 @@ def test_qwen2vl_multimodal_template_basic(self): ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] - assert len(encoded[0]['input_ids']) > 0 - assert len(encoded[0]['labels']) == len(encoded[0]['input_ids']) + assert 'input_ids' in encoded + assert 'labels' in encoded + assert len(encoded['input_ids']) > 0 + assert len(encoded['labels']) == len(encoded['input_ids']) - input_ids = encoded[0]['input_ids'] - labels = encoded[0]['labels'] + input_ids = encoded['input_ids'] + labels = encoded['labels'] assert isinstance(input_ids, np.ndarray) assert isinstance(labels, np.ndarray) @@ -141,14 +138,13 @@ def test_qwen2vl_multimodal_template_with_placeholder(self): ] trajectory = Trajectory(messages=messages, images=[image_url]) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] + assert 'input_ids' in encoded + assert 'labels' in encoded - if 'pixel_values' in encoded[0]: - assert encoded[0]['pixel_values'].shape[0] > 0 + if 'pixel_values' in encoded: + assert encoded['pixel_values'].shape[0] > 0 @pytest.mark.skipif(SKIP_MODEL_DOWNLOAD, reason='Skipping tests that require model download') def test_qwen2vl_multimodal_template_labels_correctness(self): @@ -164,10 +160,10 @@ def test_qwen2vl_multimodal_template_labels_correctness(self): ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - input_ids = encoded[0]['input_ids'] - labels = encoded[0]['labels'] + input_ids = encoded['input_ids'] + labels = encoded['labels'] assert len(input_ids) == len(labels) @@ -191,11 +187,10 @@ def test_qwen2vl_multimodal_template_multiple_images(self): ] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] + assert 'input_ids' in encoded + assert 'labels' in encoded class TestTemplateEdgeCases: @@ -210,11 +205,10 @@ def test_text_template_empty_assistant(self): messages = [Message(role='user', content='Hello')] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert 'input_ids' in encoded[0] - assert 'labels' in encoded[0] + assert 'input_ids' in encoded + assert 'labels' in encoded @pytest.mark.skipif(SKIP_MODEL_DOWNLOAD, reason='Skipping tests that require model download') def test_text_template_max_length_truncation(self): @@ -227,7 +221,6 @@ def test_text_template_max_length_truncation(self): messages = 
[Message(role='user', content=long_text), Message(role='assistant', content='Response')] trajectory = Trajectory(messages=messages) - encoded = template.batch_encode([trajectory]) + encoded = template.encode(trajectory) - assert len(encoded) == 1 - assert len(encoded[0]['input_ids']) <= 50 + assert len(encoded['input_ids']) <= 50 From 3f63c891ec4001278516b8705ae16c9d2c8a7886 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 15 Apr 2026 21:20:43 +0800 Subject: [PATCH 7/8] fix --- .../Components/Dataset/LazyDataset.md | 71 ++++++++++++++++++- .../LazyDataset.md" | 71 ++++++++++++++++++- 2 files changed, 136 insertions(+), 6 deletions(-) diff --git a/docs/source_en/Components/Dataset/LazyDataset.md b/docs/source_en/Components/Dataset/LazyDataset.md index 50a20215..f2ccc3a4 100644 --- a/docs/source_en/Components/Dataset/LazyDataset.md +++ b/docs/source_en/Components/Dataset/LazyDataset.md @@ -1,6 +1,71 @@ # Lazy Loading Dataset -The difference between lazy loading datasets and `Dataset` is that its encode process occurs during `__getitem__`. When you call `encode`, the dataset will only mark that encoding needs to be performed when actually fetching data. -This type of dataset is generally used for multimodal scenarios to prevent memory explosion. +LazyDataset is a variant of `Dataset` that defers expensive operations (preprocessing, encoding) to `__getitem__` time, preventing OOM for large or multimodal datasets. -Lazy loading datasets also have the `@remote_class` decorator and can run in Ray workers. +## Key Differences from Dataset + +| Operation | Dataset | LazyDataset | +|-----------|---------|-------------| +| `map` | Executes immediately on all data | Records the operation, applies per-item in `__getitem__` | +| `filter` | Executes immediately | Executes immediately (same as Dataset, index mapping required) | +| `mix_dataset` | Merges datasets immediately | Records strategy, resolves indices lazily | +| `encode` | Encodes all data immediately | Records flag, encodes per-item in `__getitem__` | + +## Lazy Map + +When you call `map`, LazyDataset records the preprocessing function instead of applying it eagerly: + +```python +from twinkle.dataset import LazyDataset, DatasetMeta + +dataset = LazyDataset(DatasetMeta(dataset_id='ms://xxx/xxx')) +dataset.add_dataset(DatasetMeta(dataset_id='ms://yyy/yyy')) + +# Per-dataset preprocessing (before mix) +dataset.map(preprocess_fn_a, dataset_meta=DatasetMeta(dataset_id='ms://xxx/xxx')) +dataset.map(preprocess_fn_b, dataset_meta=DatasetMeta(dataset_id='ms://yyy/yyy')) + +dataset.mix_dataset() + +# Global preprocessing (after mix, applies to all items) +dataset.map(global_preprocess_fn) +``` + +- **Before mix**: `map` is recorded per-dataset, so different datasets can have different preprocessing pipelines. +- **After mix**: `map` is recorded globally and applies to all items regardless of source dataset. +- All map operations are applied lazily in `__getitem__` in the order they were registered. + +## Lazy Mix + +`mix_dataset` supports two strategies: + +```python +dataset.mix_dataset(interleave=True) # Round-robin interleaving (default) +dataset.mix_dataset(interleave=False) # Concatenation +``` + +- **Interleave**: Items cycle through datasets in round-robin order. Shorter datasets wrap around. +- **Concatenate**: Items are accessed sequentially — all of dataset A, then all of dataset B. + +## Lazy Encode + +Calling `encode` only marks the dataset for encoding. 
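+A minimal sketch of what that mark triggers, simplified from the `lazy_dataset.py` hunk earlier in this patch series (`_raw_item` is a hypothetical helper standing in for index resolution and the recorded lazy maps):
+
+```python
+# Sketch only -- the real logic lives in LazyDataset.__getitem__.
+def __getitem__(self, idx):
+    item = self._raw_item(idx)  # resolve index, apply recorded map fns
+    if self.do_encode:  # flag set by dataset.encode()
+        encoded = self.template.encode(item, add_generation_prompt=self.add_generation_prompt, **self.encode_kwargs)
+        # Preserve extra fields not produced by encoding
+        for key in item:
+            if key not in encoded:
+                encoded[key] = item[key]
+        item = encoded
+    return item
+```
+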
The actual `template.encode()` call happens inside `__getitem__`: + +```python +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.encode() +``` + +> Note: `truncation_strategy='split'` is not supported in LazyDataset because splitting may produce multiple outputs from a single item. + +## Eager Filter + +Unlike other operations, `filter` executes immediately because it needs to build the index mapping of valid items upfront: + +```python +dataset.filter(filter_fn, dataset_meta=DatasetMeta(dataset_id='ms://xxx/xxx')) +``` + +## Remote Execution + +LazyDataset has the `@remote_class` decorator and can run in Ray workers, just like `Dataset`. diff --git "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/LazyDataset.md" "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/LazyDataset.md" index 4161445c..4e2c00a8 100644 --- "a/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/LazyDataset.md" +++ "b/docs/source_zh/\347\273\204\344\273\266/\346\225\260\346\215\256\351\233\206/LazyDataset.md" @@ -1,6 +1,71 @@ # 懒加载数据集 -懒加载数据集和`Dataset`的区别在于它的encode过程发生在`__getitem__`的时候。在你调用`encode`的时候,数据集仅会进行标记,表示在实际取数据的时候需要进行encode。 -这种数据集一般用于多模态场景,用于防止内存爆炸。 +LazyDataset 是 `Dataset` 的变体,它将预处理、编码等开销较大的操作推迟到 `__getitem__` 时执行,从而避免大规模或多模态数据集的内存溢出问题。 -懒加载数据集也有`@remote_class`装饰器,可以在ray的worker中运行。 +## 与 Dataset 的关键差异 + +| 操作 | Dataset | LazyDataset | +|------|---------|-------------| +| `map` | 立即对所有数据执行 | 记录操作,在 `__getitem__` 中逐条执行 | +| `filter` | 立即执行 | 立即执行(与 Dataset 相同,需要构建索引映射) | +| `mix_dataset` | 立即合并数据集 | 记录策略,延迟解析索引 | +| `encode` | 立即编码所有数据 | 记录标志,在 `__getitem__` 中逐条编码 | + +## 懒加载 Map + +调用 `map` 时,LazyDataset 会记录预处理函数而非立即执行: + +```python +from twinkle.dataset import LazyDataset, DatasetMeta + +dataset = LazyDataset(DatasetMeta(dataset_id='ms://xxx/xxx')) +dataset.add_dataset(DatasetMeta(dataset_id='ms://yyy/yyy')) + +# 按数据集的预处理(混合前) +dataset.map(preprocess_fn_a, dataset_meta=DatasetMeta(dataset_id='ms://xxx/xxx')) +dataset.map(preprocess_fn_b, dataset_meta=DatasetMeta(dataset_id='ms://yyy/yyy')) + +dataset.mix_dataset() + +# 全局预处理(混合后,对所有数据生效) +dataset.map(global_preprocess_fn) +``` + +- **混合前**:`map` 按数据集记录,不同数据集可以有不同的预处理流程。 +- **混合后**:`map` 全局记录,对所有数据统一生效。 +- 所有 map 操作在 `__getitem__` 中按注册顺序依次执行。 + +## 懒加载 Mix + +`mix_dataset` 支持两种策略: + +```python +dataset.mix_dataset(interleave=True) # 轮询交错(默认) +dataset.mix_dataset(interleave=False) # 顺序拼接 +``` + +- **交错**:按轮询顺序从各数据集中取数据,较短的数据集会循环。 +- **拼接**:按顺序访问——先取完数据集 A 的全部数据,再取数据集 B。 + +## 懒加载 Encode + +调用 `encode` 仅标记需要编码,实际的 `template.encode()` 在 `__getitem__` 中执行: + +```python +dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B', max_length=512) +dataset.encode() +``` + +> 注意:LazyDataset 不支持 `truncation_strategy='split'`,因为分割可能从单条数据产生多条输出。 + +## 即时 Filter + +与其他操作不同,`filter` 会立即执行,因为它需要预先构建有效数据项的索引映射: + +```python +dataset.filter(filter_fn, dataset_meta=DatasetMeta(dataset_id='ms://xxx/xxx')) +``` + +## 远程执行 + +LazyDataset 拥有 `@remote_class` 装饰器,可以在 Ray Worker 中运行,与 `Dataset` 一致。 From c6e66fa8b1e124c6b6bca5bcd2a6c5fff9923715 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Thu, 16 Apr 2026 15:28:22 +0800 Subject: [PATCH 8/8] fix --- Dockerfile | 2 +- cookbook/sample/sample.py | 109 ++++++++++++++++++ .../model/megatron/strategy/megatron.py | 1 + 3 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 cookbook/sample/sample.py diff --git a/Dockerfile b/Dockerfile index e29e4d17..8107ebcb 100644 --- 
a/Dockerfile
+++ b/Dockerfile
@@ -37,7 +37,7 @@ RUN pip install flash-linear-attention -U --no-cache-dir
 RUN pip install numpy==2.2 --no-cache-dir
 
 # Install tinker, ray, and other deps
-RUN pip install --no-cache-dir tinker==0.16.1 "ray[serve]" transformers peft accelerate -U
+RUN pip install --no-cache-dir tinker==0.16.1 "ray[serve]" transformers "peft<=0.18" accelerate -U
 
 # Clone and install twinkle, checkout to latest v-tag
 RUN git clone https://github.com/modelscope/twinkle.git
diff --git a/cookbook/sample/sample.py b/cookbook/sample/sample.py
new file mode 100644
index 00000000..b56460ea
--- /dev/null
+++ b/cookbook/sample/sample.py
@@ -0,0 +1,109 @@
+"""
+Standalone inference example using Ray + vLLMSampler with LoRA adapter.
+
+This script demonstrates how to:
+1. Initialize Twinkle with Ray for distributed inference
+2. Create a vLLMSampler with LoRA enabled on dedicated GPUs
+3. Load a LoRA adapter from a local checkpoint path
+4. Send prompts (Trajectory format) and collect generated responses
+
+Usage:
+    # Single GPU inference
+    SAMPLER_GPUS=1 python sample.py
+
+    # Multi-GPU inference (tensor parallel)
+    SAMPLER_GPUS=2 python sample.py
+
+    # Use a different model / adapter
+    MODEL_ID=/path/to/model LORA_PATH=/path/to/adapter SAMPLER_GPUS=1 python sample.py
+"""
+
+import os
+from typing import List, Dict, Any
+
+import twinkle
+from twinkle import DeviceMesh, DeviceGroup, get_device_placement, get_logger
+from twinkle.data_format import SamplingParams
+from twinkle.sampler import vLLMSampler
+
+logger = get_logger()
+
+MODEL_ID = os.environ.get('MODEL_ID', 'Qwen/Qwen3.5-4B')
+LORA_PATH = os.environ.get('LORA_PATH', '/path/to/lora')
+SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 1))
+
+
+def build_prompts() -> List[Dict[str, Any]]:
+    """Build a list of Trajectory dicts (messages format) as prompts."""
+    prompts = [
+        {
+            'messages': [
+                {'role': 'system', 'content': 'You are a helpful assistant.'},
+                {'role': 'user', 'content': 'What is the capital of France?'},
+            ]
+        },
+        {
+            'messages': [
+                {'role': 'system', 'content': 'You are a helpful assistant.'},
+                {'role': 'user', 'content': 'Write a short poem about the moon.'},
+            ]
+        },
+        {
+            'messages': [
+                {'role': 'user', 'content': 'Solve: 2x + 3 = 11. What is x?'},
+            ]
+        },
+    ]
+    return prompts
+
+
+def main():
+    # ── 1. Initialize Twinkle with Ray ──────────────────────────────────
+    device_groups = [
+        DeviceGroup(name='sampler', ranks=list(range(SAMPLER_GPUS)), device_type='GPU', gpus_per_worker=SAMPLER_GPUS),
+    ]
+    sampler_mesh = DeviceMesh.from_sizes(world_size=SAMPLER_GPUS, tp_size=SAMPLER_GPUS)
+    twinkle.initialize(mode='ray', nproc_per_node=SAMPLER_GPUS, groups=device_groups)
+
+    # ── 2. Create vLLMSampler with LoRA enabled ────────────────────────
+    sampler = vLLMSampler(
+        model_id=MODEL_ID,
+        engine_args={
+            'gpu_memory_utilization': 0.7,
+            'max_model_len': 4096,
+            'enable_lora': True,
+            'max_loras': 1,
+            'max_lora_rank': 32,
+            'enable_tower_connector_lora': True,
+        },
+        device_mesh=sampler_mesh,
+        remote_group='sampler',
+    )
+    sampler.set_template('Qwen3_5Template', model_id=MODEL_ID)
+    logger.info(get_device_placement())
+
+    # ── 3. Configure sampling parameters ────────────────────────────────
+    sampling_params = SamplingParams(
+        max_tokens=2048,
+        temperature=0.7,
+        top_p=0.9,
+        num_samples=1,
+    )
+
+    # ── 4. 
Run inference ──────────────────────────────────────────────── + prompts = build_prompts() + logger.info(f'Sampling {len(prompts)} prompts with model {MODEL_ID} ...') + + responses = sampler.sample(prompts, sampling_params, adapter_path=LORA_PATH) + + # ── 5. Print results ──────────────────────────────────────────────── + for i, response in enumerate(responses): + for seq in response.sequences: + text = sampler.template.tokenizer.decode(seq.tokens, skip_special_tokens=True) + logger.info(f'\n{"="*60}\nPrompt {i}: {prompts[i]["messages"][-1]["content"]}\n{"─"*60}\n{text}\n') + + logger.info('Done.') + + +if __name__ == '__main__': + main() diff --git a/src/twinkle/model/megatron/strategy/megatron.py b/src/twinkle/model/megatron/strategy/megatron.py index b9e66505..9d03bb87 100644 --- a/src/twinkle/model/megatron/strategy/megatron.py +++ b/src/twinkle/model/megatron/strategy/megatron.py @@ -187,6 +187,7 @@ def _wrap_with_megatron_ddp( wrapped_models = [] for _model in model: + _model = MegatronStrategy._move_model_to_gpu(_model) config: TransformerConfig = _model.config # noqa if not isinstance(model, Float16Module) and (config.fp16 or config.bf16):