From 2f4edd279cf186670a730453549cd5e0592c83cf Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:04:08 +0800 Subject: [PATCH 1/3] support q3.6-27b --- README.md | 7 +- README_ZH.md | 7 +- .../client/server/megatron/server_config.yaml | 26 ++-- cookbook/client/tinker/modelscope/dpo.py | 2 +- cookbook/client/tinker/modelscope/sample.py | 2 +- .../tinker/modelscope/self_cognition.py | 2 +- .../tinker/modelscope/short_math_grpo.py | 2 +- cookbook/client/twinkle/modelscope/dpo.py | 2 +- .../client/twinkle/modelscope/multi_modal.py | 2 +- .../twinkle/modelscope/self_congnition.py | 2 +- cookbook/rl/short_math_grpo_moe.py | 2 +- cookbook/rl/short_math_grpo_multi_lora.py | 4 +- .../Usage Guide/Train-as-a-Service.md | 8 +- ...55\347\273\203\346\234\215\345\212\241.md" | 8 +- notebook/dpo.ipynb | 90 +++++++----- notebook/multi_modal.ipynb | 88 +++++++----- notebook/sample.ipynb | 65 ++++++--- notebook/self_cognition.ipynb | 88 +++++++----- notebook/short_math_grpo.ipynb | 133 ++++++++---------- src/twinkle/server/gateway/server.py | 2 +- 20 files changed, 294 insertions(+), 248 deletions(-) diff --git a/README.md b/README.md index e6779d46..7c01765e 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,7 @@ sh INSTALL_MEGATRON.sh | Server startup scripts | transformers/megatron | [Script](cookbook/client/server) | ## Changelog +- 🎉2026-04-22 The ModelScope service has been deployed to [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B) with a new release 0.2.1. - 🎉2026-04-14 The ModelScope service has been deployed to [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B) with a new release 0.2.0. - 🎉2026-03-28 Support DPO training with both Transformers and Megatron backends. See [dpo_full.py](cookbook/rl/dpo_full.py) and [dpo_lora.py](cookbook/rl/dpo_lora.py). - 🎉2026-03-24 Twinkle Web site is now live at https://modelscope.github.io/twinkle-web/ @@ -143,7 +144,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). +> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -192,7 +193,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' 
- base_model = 'ms://Qwen/Qwen3.6-35B-A3B' + base_model = 'ms://Qwen/Qwen3.6-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -248,7 +249,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/README_ZH.md b/README_ZH.md index 64ce16a6..3496abb8 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -105,6 +105,7 @@ sh INSTALL_MEGATRON.sh Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Client等各场景下。其算法过程是外露的,非常便于修改和调试。完整的框架介绍请查看[快速开始](docs/source_zh/使用指引/快速开始.md) ## 更新日志 +🎉2026-04-22 ModelScope的训练服务部署为[Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B),并发布了0.2.1版本. 🎉2026-04-16 ModelScope的训练服务部署为[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B),并发布了0.2.0版本. 🎉2026-03-28 支持 DPO 训练,同时支持 Transformers 和 Megatron 后端。参考 [dpo_full.py](cookbook/rl/dpo_full.py) 和 [dpo_lora.py](cookbook/rl/dpo_lora.py)。 🎉2026-03-24 Twinkle 站点上线,访问地址 https://modelscope.github.io/twinkle-web/ @@ -129,7 +130,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| @@ -177,7 +178,7 @@ twinkle.initialize(mode='ray', groups=device_group, global_device_mesh=device_me def train(): # to load model from Hugging Face, use 'hf://...' - base_model = 'ms://Qwen/Qwen3.6-35B-A3B' + base_model = 'ms://Qwen/Qwen3.6-27B' # 1000 samples dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000))) # Set template to prepare encoding @@ -233,7 +234,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='your-base-url' api_key='your-api-key' diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index d95bfa4e..69620020 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -39,15 +39,15 @@ applications: # Used for generating text from the model (e.g., evaluating LoRA results). 
# Config: TP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, ~37GB for KV cache + LoRA
 - name: sampler-Qwen3.6-35B-A3B
- route_prefix: /api/v1/sampler/Qwen/Qwen3.6-35B-A3B
+ route_prefix: /api/v1/sampler/Qwen/Qwen3.6-27B
 import_path: sampler
 args:
- model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier
+ model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
 nproc_per_node: 4 # Number of GPU processes per node
 sampler_type: vllm # Inference engine: 'vllm' (fast) or 'torch' (TorchSampler)
 engine_args: # vLLM engine-specific settings
- max_model_len: 32000 # Maximum sequence length the engine supports
- gpu_memory_utilization: 0.80 # 80% utilization, ~64GB/GPU, leaves buffer for safety
+ max_model_len: 65536 # Maximum sequence length the engine supports
+ gpu_memory_utilization: 0.75 # 75% utilization, ~60GB/GPU, leaves buffer for safety
 enable_lora: true # Allow loading LoRA adapters during inference
 max_loras: 5 # Max allowed loras working on vLLM at the same time
 max_lora_rank: 32 # Support up to rank 64 LoRA adapters
@@ -63,8 +63,8 @@
 tp_size: 2 # 2 TP replicas for multi-tenant throughput
 queue_config:
 rps_limit: 20 # Max requests per second
- tps_limit: 32000 # Max tokens per second
- max_input_tokens: 32000
+ tps_limit: 131072 # Max tokens per second
+ max_input_tokens: 65536
 deployments:
 - name: SamplerManagement
 autoscaling_config:
@@ -81,12 +81,12 @@
 # 2. Model Service - Hosts the base model for training.
 # Config: PP=2 x DP=2 on 4 GPUs, ~27GB weights/GPU, comfortable for LoRA training
 - name: models-Qwen3.6-35B-A3B
- route_prefix: /api/v1/model/Qwen/Qwen3.6-35B-A3B
+ route_prefix: /api/v1/model/Qwen/Qwen3.6-27B
 import_path: model
 args:
 use_megatron: true # Use Megatron-LM backend
- model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier
- max_length: 32000 # model max length
+ model_id: "ms://Qwen/Qwen3.6-27B" # ModelScope model identifier
+ max_length: 65536 # model max length
 max_loras: 3 # model max loras
 nproc_per_node: 4 # Number of GPU processes per node
 device_group:
@@ -95,15 +95,13 @@
 device_type: cuda
 device_mesh:
 device_type: cuda
- tp_size: 2
- ep_size: 2
+ dp_size: 2
 pp_size: 2
- sequence_parallel: True
 queue_config:
 rps_limit: 20 # Max requests per second
- tps_limit: 32000 # Max tokens per second
- max_input_tokens: 32000
+ tps_limit: 131072 # Max tokens per second
+ max_input_tokens: 65536
 adapter_config:
 adapter_timeout: 120 # Seconds before idle adapter unload
 adapter_max_lifetime: 36000 # Maximum lifetime of an adapter in seconds (e.g., 10 hours)
diff --git a/cookbook/client/tinker/modelscope/dpo.py b/cookbook/client/tinker/modelscope/dpo.py
index a88b70a7..23cf5aae 100644
--- a/cookbook/client/tinker/modelscope/dpo.py
+++ b/cookbook/client/tinker/modelscope/dpo.py
@@ -39,7 +39,7 @@
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
-base_model = 'Qwen/Qwen3.6-35B-A3B'
+base_model = 'Qwen/Qwen3.6-27B'
 base_url = 'http://www.modelscope.cn/twinkle'
 api_key = os.environ.get('MODELSCOPE_TOKEN')
 dataset_id = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'
diff --git a/cookbook/client/tinker/modelscope/sample.py b/cookbook/client/tinker/modelscope/sample.py
index d12f3bd6..7e03e0bd 100644
--- a/cookbook/client/tinker/modelscope/sample.py
+++ b/cookbook/client/tinker/modelscope/sample.py
@@ -16,7 +16,7 @@
 from tinker import ServiceClient
-base_model = 'Qwen/Qwen3.6-35B-A3B'
+base_model
= 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git a/cookbook/client/tinker/modelscope/self_cognition.py b/cookbook/client/tinker/modelscope/self_cognition.py index 7780df60..f74ec073 100644 --- a/cookbook/client/tinker/modelscope/self_cognition.py +++ b/cookbook/client/tinker/modelscope/self_cognition.py @@ -23,7 +23,7 @@ from tinker import ServiceClient # The base model to fine-tune / evaluate -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' diff --git a/cookbook/client/tinker/modelscope/short_math_grpo.py b/cookbook/client/tinker/modelscope/short_math_grpo.py index cf210c46..bf57a942 100644 --- a/cookbook/client/tinker/modelscope/short_math_grpo.py +++ b/cookbook/client/tinker/modelscope/short_math_grpo.py @@ -38,7 +38,7 @@ logger = get_logger() # ========== Configuration ========== -BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B' +BASE_MODEL = 'Qwen/Qwen3.6-27B' NUM_GENERATIONS = 4 MAX_NEW_TOKENS = 4096 LEARNING_RATE = 2e-5 diff --git a/cookbook/client/twinkle/modelscope/dpo.py b/cookbook/client/twinkle/modelscope/dpo.py index 17a69965..e9451e31 100644 --- a/cookbook/client/twinkle/modelscope/dpo.py +++ b/cookbook/client/twinkle/modelscope/dpo.py @@ -24,7 +24,7 @@ logger = get_logger() # Configuration (direct values, not from env) -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' dataset_id = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji' diff --git a/cookbook/client/twinkle/modelscope/multi_modal.py b/cookbook/client/twinkle/modelscope/multi_modal.py index 331352c8..106d85d4 100644 --- a/cookbook/client/twinkle/modelscope/multi_modal.py +++ b/cookbook/client/twinkle/modelscope/multi_modal.py @@ -24,7 +24,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/cookbook/client/twinkle/modelscope/self_congnition.py b/cookbook/client/twinkle/modelscope/self_congnition.py index 2248ddd4..5acd8a9a 100644 --- a/cookbook/client/twinkle/modelscope/self_congnition.py +++ b/cookbook/client/twinkle/modelscope/self_congnition.py @@ -21,7 +21,7 @@ logger = get_logger() -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'http://www.modelscope.cn/twinkle' # Step 2: Initialize the Twinkle client to communicate with the remote server. diff --git a/cookbook/rl/short_math_grpo_moe.py b/cookbook/rl/short_math_grpo_moe.py index 9d870eac..6ad5cc2f 100644 --- a/cookbook/rl/short_math_grpo_moe.py +++ b/cookbook/rl/short_math_grpo_moe.py @@ -28,7 +28,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) diff --git a/cookbook/rl/short_math_grpo_multi_lora.py b/cookbook/rl/short_math_grpo_multi_lora.py index 9dad8df3..96d7ef9b 100644 --- a/cookbook/rl/short_math_grpo_multi_lora.py +++ b/cookbook/rl/short_math_grpo_multi_lora.py @@ -5,7 +5,7 @@ weights to a local directory, then passes the path to vLLMSampler via `adapter_path` so vLLM loads the latest adapter from disk. 
-Model: Qwen/Qwen3.6-35B-A3B (MoE, 35B total / 3B active) +Model: Qwen/Qwen3.6-27B (MoE, 35B total / 3B active) Model mesh: tp=2, ep=2, pp=2, sequence_parallel=True (8 GPUs) Sampler mesh: dp=2, tp=2, gpus_per_worker=2 (4 GPUs) @@ -35,7 +35,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2)) diff --git a/docs/source_en/Usage Guide/Train-as-a-Service.md b/docs/source_en/Usage Guide/Train-as-a-Service.md index b3a3d736..57ba26fb 100644 --- a/docs/source_en/Usage Guide/Train-as-a-Service.md +++ b/docs/source_en/Usage Guide/Train-as-a-Service.md @@ -2,7 +2,7 @@ Alongside the open-source release of the Twinkle framework, we also provide a hosted model training service (Training as a Service) powered by ModelScope's backend infrastructure. Developers can use this service to experience Twinkle's training API for free. -The model currently running on the cluster is [Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B). Below are the detailed usage instructions: +The model currently running on the cluster is [Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B). Below are the detailed usage instructions: ## Step 1. Register a ModelScope Account and Obtain Your API Key @@ -30,7 +30,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -64,7 +64,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.6-35B-A3B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: +With the code above, you can train a self-cognition LoRA based on `Qwen/Qwen3.6-27B`. This LoRA will change the model's name and creator to the names specified during training. To perform inference using this LoRA: ```python import os @@ -79,7 +79,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" index 001ad5e7..c5db28d3 100644 --- "a/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" +++ "b/docs/source_zh/\344\275\277\347\224\250\346\214\207\345\274\225/\350\256\255\347\273\203\346\234\215\345\212\241.md" @@ -3,7 +3,7 @@ 在 Twinkle 框架开源的同时,我们依托ModelScope的后台服务,也提供了托管的模型训练服务(Training as a Service),开发者可以通过这一服务, 免费体验Twinkle的训练API。 -目前在集群中运行的模型是[Qwen/Qwen3.6-35B-A3B](https://www.modelscope.cn/models/Qwen/Qwen3.6-35B-A3B)。下面介绍具体的使用方法: +目前在集群中运行的模型是[Qwen/Qwen3.6-27B](https://www.modelscope.cn/models/Qwen/Qwen3.6-27B)。下面介绍具体的使用方法: ## Step 1. 
注册ModelScope用户并获取 API Key @@ -31,7 +31,7 @@ from twinkle.dataset import Dataset, DatasetMeta from twinkle.preprocessor import SelfCognitionProcessor from twinkle.server.common import input_feature_to_datum -base_model = 'ms://Qwen/Qwen3.6-35B-A3B' +base_model = 'ms://Qwen/Qwen3.6-27B' base_url='https://www.modelscope.cn/twinkle' api_key=os.environ.get('MODELSCOPE_TOKEN') @@ -65,7 +65,7 @@ for epoch in range(2): print(f'Saved checkpoint for epoch {epoch} to {result.path}') ``` -通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.6-35B-A3B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: +通过上述代码,你可以训练一个原模型为`Qwen/Qwen3.6-27B`的自我认知lora。这个lora会改变模型的名称和制造者为训练时指定的名称。使用这个lora进行推理: ```python import os @@ -80,7 +80,7 @@ init_tinker_client() from tinker import ServiceClient -base_model = 'Qwen/Qwen3.6-35B-A3B' +base_model = 'Qwen/Qwen3.6-27B' base_url = 'https://www.modelscope.cn/twinkle' # Step 2: Define the base model and connect to the server diff --git a/notebook/dpo.ipynb b/notebook/dpo.ipynb index c1dc132d..caeb6582 100644 --- a/notebook/dpo.ipynb +++ b/notebook/dpo.ipynb @@ -95,7 +95,7 @@ "\n", "| 配置项 | 默认值 | 说明 |\n", "|--------|--------|------|\n", - "| `BASE_MODEL` | Qwen/Qwen3.6-35B-A3B | 基座模型 |\n", + "| `BASE_MODEL` | Qwen/Qwen3.6-27B | 基座模型 |\n", "| `BATCH_SIZE` | 4 | 每步处理的 DPO 样本对数 |\n", "| `LEARNING_RATE` | 1e-4 | 学习率 |\n", "| `DPO_BETA` | 0.1 | DPO 温度系数,控制偏好强度 |\n", @@ -149,7 +149,7 @@ "logger = get_logger()\n", "\n", "# ========== 全局配置 ==========\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "BASE_URL = 'http://www.modelscope.cn/twinkle'\n", "API_KEY = getpass(\"ModelScope Token: \")\n", "DATASET_ID = 'ms://hjh0119/shareAI-Llama3-DPO-zh-en-emoji'\n", @@ -464,13 +464,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n", - "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传值 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -488,7 +484,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: save_result.path\n", @@ -535,44 +531,60 @@ }, { "cell_type": "markdown", - "id": "13d43fcc", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", "\n", - "> **注意**:合并操作需要 
GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/multi_modal.ipynb b/notebook/multi_modal.ipynb index 41d32478..4c88d585 100644 --- a/notebook/multi_modal.ipynb +++ b/notebook/multi_modal.ipynb @@ -149,7 +149,7 @@ "metadata": {}, "outputs": [], "source": [ - "base_model = 'Qwen/Qwen3.6-35B-A3B'\n", + "base_model = 'Qwen/Qwen3.6-27B'\n", "base_url = 'http://www.modelscope.cn/twinkle'\n", "\n", "client = init_twinkle_client(base_url=base_url, api_key=api_key)\n", @@ -393,13 +393,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n", - "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传值 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -417,7 +413,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>'\n", @@ -463,44 +459,60 @@ }, { "cell_type": "markdown", - "id": "792c879b", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', 
device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/sample.ipynb b/notebook/sample.ipynb index c7cfedfe..34bd2d97 100644 --- a/notebook/sample.ipynb +++ b/notebook/sample.ipynb @@ -108,7 +108,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为你的训练检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: 'twinkle://xxx/weights/twinkle-lora-2'\n", @@ -244,31 +244,60 @@ }, { "cell_type": "markdown", - "id": "6643d9ae", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ], + "id": "peft_merge_0" + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", + "\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", + "\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + 
"outputs": [], + "execution_count": null, + "id": "peft_merge_1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" - ] + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" + ], + "id": "peft_merge_2" }, { "cell_type": "markdown", diff --git a/notebook/self_cognition.ipynb b/notebook/self_cognition.ipynb index b5cc300c..5b1d91a7 100644 --- a/notebook/self_cognition.ipynb +++ b/notebook/self_cognition.ipynb @@ -153,7 +153,7 @@ "\n", "from tinker import ServiceClient\n", "\n", - "base_model = 'Qwen/Qwen3.6-35B-A3B'\n", + "base_model = 'Qwen/Qwen3.6-27B'\n", "base_url = 'http://www.modelscope.cn/twinkle'\n", "api_key = getpass(\"ModelScope Token: \")" ] @@ -297,13 +297,9 @@ "\n", "加载训练好的 LoRA 检查点,向模型提问「你是谁?」,观察模型是否以自定义身份回答。\n", "\n", - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。下面Sample部分仅作为示例代码。训练得到的 LoRA 权重可以上传至 ModelScope 并与原始模型合并,导出为完整的模型,方便后续部署和推理。\n", - "\n", "### 2.1 加载检查点并创建采样客户端\n", "\n", - "> 将下方 `weight_path` 替换为 Part 1 训练输出的检查点路径。" + "> 将下方 `weight_path` 替换为 Part 1 训练输出的检查点路径。\n" ] }, { @@ -314,11 +310,11 @@ "outputs": [], "source": [ "# TODO: 替换为 Part 1 输出的检查点路径\n", - "weight_path = 'twinkle://20260420_142308-Qwen_Qwen3_6-35B-A3B-38777028/weights/twinkle-lora-0'\n", + "weight_path = '<替换为 Part 1 输出的检查点路径>'\n", "\n", "service_client = ServiceClient(base_url=base_url, api_key=api_key)\n", "sampling_client = service_client.create_sampling_client(model_path=weight_path, base_model=base_model)\n", - "print('采样客户端创建成功')" + "print('采样客户端创建成功')\n" ] }, { @@ -416,40 +412,56 @@ "id": "66f91299", "metadata": {}, "source": [ - "## 合并权重并导出\n", + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "id": "peft_merge_code", + "source": [ + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", + "\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "# 加载基座模型\n", + "base_model_hf = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 加载 LoRA 适配器并合并\n", + "peft_model = PeftModel.from_pretrained(base_model_hf, lora_path)\n", + "merged_model = peft_model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", 
+ "metadata": {}, + "id": "peft_merge_note", + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir <替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" ] }, { @@ -470,7 +482,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "vllm19", "language": "python", "name": "python3" }, @@ -484,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.13" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/notebook/short_math_grpo.ipynb b/notebook/short_math_grpo.ipynb index 8e8b1b31..f7344bcf 100644 --- a/notebook/short_math_grpo.ipynb +++ b/notebook/short_math_grpo.ipynb @@ -86,46 +86,18 @@ "## 第一步:导入依赖与全局配置\n", "\n", "> **为什么使用 Twinkle 客户端语法?**\n", - "> Twinkle 提供 `tinker` 和 `twinkle` 两套客户端 API。其中 **tinker** 接口不支持设置 `target_modules`、`LoraConfig` 等细节调控,而 GRPO 训练在 MoE 模型上需要显式指定 LoRA 的 target modules(否则会触发 vLLM 兼容性问题)。\n", - "> 因此本 Notebook 使用 **twinkle 客户端语法**,以获得对训练参数的完整控制。\n", + "> Twinkle 提供 `tinker` 和 `twinkle` 两套客户端 API。其中 **twinkle** 客户端提供更完整的参数控制(如 `LoraConfig`、`target_modules`),适合 GRPO 等需要精细调参的训练场景。\n", "\n", "| 配置项 | 默认值 | 说明 |\n", "|--------|--------|------|\n", - "| `MODEL_ID` | ms://Qwen/Qwen3.6-35B-A3B | 基座模型(需加 `ms://` 前缀) |\n", + "| `MODEL_ID` | ms://Qwen/Qwen3.6-27B | 基座模型(需加 `ms://` 前缀) |\n", "| `NUM_GENERATIONS` | 4 | 每个 prompt 生成几条回答 |\n", "| `MAX_NEW_TOKENS` | 1024 | 单条回答最大 token 数 |\n", "| `LEARNING_RATE` | 2e-5 | 学习率 |\n", "| `MAX_STEPS` | 100 | 最大训练步数 |\n", "| `BATCH_SIZE` | 2 | 每步的 prompt 数量(实际训练样本 = BATCH_SIZE × NUM_GENERATIONS) |\n", "| `TEMPERATURE` | 1.0 | 采样温度,RL 训练中通常设为 1.0 保持多样性 |\n", - "| `SYNC_INTERVAL` | 1 | 每隔多少步同步权重到采样端 |" - ] - }, - { - "cell_type": "markdown", - "id": "ef3af352", - "metadata": {}, - "source": [ - "### ⚠️ MoE 模型 LoRA 注意事项\n", - "\n", - "由于 `Qwen/Qwen3.6-35B-A3B` 是 MoE(Mixture of Experts)架构,在配合 vLLM 采样时存在已知兼容性问题。\n", - "如果你在本地使用 Megatron 进行 GRPO 训练,建议显式指定 `target_modules`(而非 `all-linear`):\n", - "\n", - "```python\n", - "target_modules:\n", - " - mlp.linear_fc1\n", - " - mlp.linear_fc2\n", - " - attn.proj\n", - " - shared_experts.linear_fc1\n", - " - shared_experts.linear_fc2\n", - " - linear_qkv\n", - " - in_proj\n", - " - out_proj\n", - " - linear_proj\n", - "```\n", - "\n", - "> **注意**:此配置是一个示例,由于问题来自 vLLM 侧的 MoE LoRA 支持尚不完善,实际训练效果可能受限。\n", - "> 如果不需要在线采样(vLLM),使用 `all-linear` 仍然可以正常训练。" + "| `SYNC_INTERVAL` | 1 | 每隔多少步同步权重到采样端 |\n" ] }, { @@ -177,7 +149,7 @@ "logger 
= get_logger()\n", "\n", "# ========== 全局配置 ==========\n", - "MODEL_ID = 'ms://Qwen/Qwen3.6-35B-A3B'\n", + "MODEL_ID = 'ms://Qwen/Qwen3.6-27B'\n", "NUM_GENERATIONS = 4\n", "MAX_NEW_TOKENS = 1024\n", "LEARNING_RATE = 2e-5\n", @@ -355,9 +327,7 @@ "\n", "- **`MultiLoraTransformersModel`**:支持 LoRA 适配器、损失函数、优化器、模板等全部设置\n", "- **`vLLMSampler`**:采样端,支持 `adapter_uri` 动态加载最新 LoRA 权重\n", - "- **`LoraConfig`**:可以精确控制 `target_modules`,这是使用 twinkle 语法的关键优势\n", - "\n", - "> 对于 MoE 模型,必须显式指定 `target_modules` 而非 `all-linear`,以避免 vLLM 兼容性问题。\n" + "- **`LoraConfig`**:可以精确控制 `target_modules`、`rank` 等参数\n" ] }, { @@ -376,15 +346,10 @@ "# 配置训练模型\n", "model = MultiLoraTransformersModel(model_id=MODEL_ID)\n", "\n", - "# LoRA 配置 —— 显式指定 target_modules(MoE 模型关键)\n", + "# LoRA 配置\n", "lora_config = LoraConfig(\n", - " target_modules=[\n", - " 'mlp.linear_fc1', 'mlp.linear_fc2',\n", - " 'attn.proj',\n", - " 'shared_experts.linear_fc1', 'shared_experts.linear_fc2',\n", - " 'linear_qkv', 'in_proj', 'out_proj', 'linear_proj',\n", - " ],\n", - " r=8,\n", + " target_modules='all-linear',\n", + " r=16,\n", " lora_alpha=32,\n", " lora_dropout=0.05,\n", ")\n", @@ -420,7 +385,7 @@ " 'logprobs': 1,\n", "}\n", "\n", - "print('模型和采样端配置完成')" + "print('模型和采样端配置完成')\n" ] }, { @@ -594,9 +559,9 @@ "\n", "训练完成后,可以直接使用 **线上服务** 进行推理,无需本地 GPU。\n", "\n", - "通过 `save_weights_and_get_sampling_client` 或 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", + "通过 `create_sampling_client` 加载训练好的 LoRA 检查点,即可在线采样生成。\n", "\n", - "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。" + "> 将下方 `weight_path` 替换为训练输出的检查点路径(`twinkle://...` 格式)。\n" ] }, { @@ -614,7 +579,7 @@ "\n", "logger = get_logger()\n", "\n", - "BASE_MODEL = 'Qwen/Qwen3.6-35B-A3B'\n", + "BASE_MODEL = 'Qwen/Qwen3.6-27B'\n", "\n", "# TODO: 替换为训练输出的检查点路径\n", "weight_path = '<替换为你的 twinkle:// 检查点路径>' # 例如: save_result.path\n", @@ -661,43 +626,59 @@ }, { "cell_type": "markdown", - "id": "4bc85a32", + "id": "peft_merge_0", + "metadata": {}, + "source": [ + "## 合并 LoRA 权重(可选)\n", + "\n", + "如果需要将 LoRA 权重合并为完整模型(用于无 LoRA 支持的部署场景),可以使用 PEFT 提供的合并功能。\n", + "\n", + "> **注意**:合并操作需要加载完整模型,请在有足够显存的环境下执行。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "peft_merge_1", "metadata": {}, + "outputs": [], "source": [ - "## 合并权重并导出\n", + "from peft import PeftModel\n", + "from transformers import AutoModelForCausalLM, AutoTokenizer\n", "\n", - "训练得到的 LoRA 权重可以与原始模型合并,导出为完整的 HuggingFace 模型,方便后续部署和推理。\n", + "base_model_id = 'Qwen/Qwen3.6-27B'\n", + "lora_path = '<替换为你的 LoRA 检查点路径>'\n", + "output_dir = '<替换为输出目录>'\n", "\n", - "> **注意**:合并操作需要 GPU 资源(需要加载完整模型),请在有足够显存的环境下执行。\n", + "# 加载基座模型\n", + "base_model = AutoModelForCausalLM.from_pretrained(\n", + " base_model_id, torch_dtype='auto', device_map='auto'\n", + ")\n", + "tokenizer = AutoTokenizer.from_pretrained(base_model_id)\n", + "\n", + "# 加载 LoRA 适配器并合并\n", + "model = PeftModel.from_pretrained(base_model, lora_path)\n", + "merged_model = model.merge_and_unload()\n", + "\n", + "# 保存合并后的完整模型\n", + "merged_model.save_pretrained(output_dir)\n", + "tokenizer.save_pretrained(output_dir)\n", + "print(f'合并完成,模型已保存到 {output_dir}')\n" + ] + }, + { + "cell_type": "markdown", + "id": "peft_merge_2", + "metadata": {}, + "source": [ + "合并后的模型为标准 HuggingFace 格式,可直接用 vLLM、Transformers 等框架加载推理:\n", "\n", "```bash\n", - "CUDA_VISIBLE_DEVICES=0,1,2,3 \\\n", - "NPROC_PER_NODE=4 \\\n", - "/opt/conda/envs/twinkle/bin/megatron export \\\n", - " --model Qwen/Qwen3.6-35B-A3B \\\n", - " --adapters <替换为你的 LoRA 检查点路径> \\\n", - " --output_dir 
<替换为输出目录> \\\n", - " --merge_lora true \\\n", - " --to_hf true \\\n", - " --tensor_model_parallel_size 2 \\\n", - " --expert_model_parallel_size 2 \\\n", - " --pipeline_model_parallel_size 2\n", + "# 使用 vLLM 部署合并后的模型\n", + "vllm serve <输出目录> --tensor-parallel-size 2\n", "```\n", "\n", - "**参数说明**:\n", - "\n", - "| 参数 | 说明 |\n", - "|------|------|\n", - "| `--model` | 基座模型 ID |\n", - "| `--adapters` | 训练保存的 LoRA 检查点路径 |\n", - "| `--output_dir` | 合并后的完整模型输出目录 |\n", - "| `--merge_lora true` | 将 LoRA 权重合并到基座模型中 |\n", - "| `--to_hf true` | 导出为 HuggingFace 格式 |\n", - "| `--tensor_model_parallel_size` | 张量并行大小 |\n", - "| `--expert_model_parallel_size` | 专家并行大小(MoE 模型专用) |\n", - "| `--pipeline_model_parallel_size` | 流水线并行大小 |\n", - "\n", - "合并完成后,输出目录中即为完整的 HuggingFace 模型,可直接用于推理或部署。" + "> **提示**:对于 Dense 模型(如 Qwen3.6-27B),LoRA 权重可以直接通过 vLLM 的 `enable_lora` 加载,无需合并。只有在不支持动态 LoRA 的部署场景下才需要合并。\n" ] }, { diff --git a/src/twinkle/server/gateway/server.py b/src/twinkle/server/gateway/server.py index dd89b84c..755c5d2b 100644 --- a/src/twinkle/server/gateway/server.py +++ b/src/twinkle/server/gateway/server.py @@ -39,7 +39,7 @@ def __init__(self, self.http_options = http_options or {} self.proxy = ServiceProxy(http_options=http_options, route_prefix=self.route_prefix) self.supported_models = self._normalize_models(supported_models) or [ - types.SupportedModel(model_name='Qwen/Qwen3.6-35B-A3B'), + types.SupportedModel(model_name='Qwen/Qwen3.6-27B'), ] self._modelscope_config_lock = asyncio.Lock() From 8d0c2c1b78d9596a2cd5a97f2fdea5a7a3c17869 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:07:00 +0800 Subject: [PATCH 2/3] fix --- cookbook/rl/short_math_grpo_moe.py | 2 +- cookbook/rl/short_math_grpo_multi_lora.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cookbook/rl/short_math_grpo_moe.py b/cookbook/rl/short_math_grpo_moe.py index 6ad5cc2f..9d870eac 100644 --- a/cookbook/rl/short_math_grpo_moe.py +++ b/cookbook/rl/short_math_grpo_moe.py @@ -28,7 +28,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') USE_MEGATRON = bool(int(os.environ.get('USE_MEGATRON', '1'))) MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) diff --git a/cookbook/rl/short_math_grpo_multi_lora.py b/cookbook/rl/short_math_grpo_multi_lora.py index 96d7ef9b..9dad8df3 100644 --- a/cookbook/rl/short_math_grpo_multi_lora.py +++ b/cookbook/rl/short_math_grpo_multi_lora.py @@ -5,7 +5,7 @@ weights to a local directory, then passes the path to vLLMSampler via `adapter_path` so vLLM loads the latest adapter from disk. 
-Model: Qwen/Qwen3.6-27B (MoE, 35B total / 3B active) +Model: Qwen/Qwen3.6-35B-A3B (MoE, 35B total / 3B active) Model mesh: tp=2, ep=2, pp=2, sequence_parallel=True (8 GPUs) Sampler mesh: dp=2, tp=2, gpus_per_worker=2 (4 GPUs) @@ -35,7 +35,7 @@ logger = get_logger() # ========== Configuration ========== -MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-27B') +MODEL_ID = os.environ.get('MODEL_ID', 'ms://Qwen/Qwen3.6-35B-A3B') MODEL_GPUS = int(os.environ.get('MODEL_GPUS', 4)) SAMPLER_GPUS = int(os.environ.get('SAMPLER_GPUS', 2)) From 5aaf97801bb4cfaac45f2b2cd7400a112d433517 Mon Sep 17 00:00:00 2001 From: tastelikefeet Date: Wed, 22 Apr 2026 16:12:08 +0800 Subject: [PATCH 3/3] fix --- README.md | 2 +- README_ZH.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7c01765e..1ee5fc7e 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ supported on Twinkle✨ framework. > For serverless training service accessed via `base_url=https://www.modelscope.cn/twinkle`, it > is currently provided via the Tinker-compatible APIs. We will be rolling out services that support > both Tinker APIs, as well as the full-fledged Twinkle✨ native APIs. The serverless endpoint is backed -> by one training base at a time, and currently it is [Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). +> by one training base at a time, and currently it is [Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B). | Model Type | Model ID on [ModelScope](https://modelscope.cn) | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:| diff --git a/README_ZH.md b/README_ZH.md index 3496abb8..b52c85c1 100644 --- a/README_ZH.md +++ b/README_ZH.md @@ -130,7 +130,7 @@ Twinkle✨支持相同的算法接口运行在单GPU、torchrun多机、Ray、Cl 随着新模型的发布,我们将添加对更多模型的支持。下表列出了 Twinkle✨ 框架当前支持的模型。 >[!Note] -> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-35B-A3B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 +> 通过 `base_url=https://www.modelscope.cn/twinkle` 访问的无服务器训练服务,目前是通过兼容Tinker的API提供的。我们将陆续推出同时支持Tinker API和完整Twinkle✨原生 API的服务。无服务器端点每次由一个训练基座支持,目前使用的是[Qwen3.6-27B](https://modelscope.cn/models/Qwen/Qwen3.6-27B)。 | Model Type | Model ID 举例 | Model Size | Requires | Support Megatron | HF Model ID | |---------------------|-----------------------------------------------------------------------------------------------------------------|:---------------------------------------:|----------------------|:----------------:|:---------------------------------------------------------------------------------------------------------:|