From 37e59bcb0165a8be1ce22bfbef016a78c20da9b6 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Thu, 16 Apr 2026 16:27:16 +0800 Subject: [PATCH 1/6] fix model id --- cookbook/client/server/megatron/run.sh | 4 ++++ src/twinkle/model/megatron/megatron.py | 1 + 2 files changed, 5 insertions(+) diff --git a/cookbook/client/server/megatron/run.sh b/cookbook/client/server/megatron/run.sh index d001023f..bb288df2 100644 --- a/cookbook/client/server/megatron/run.sh +++ b/cookbook/client/server/megatron/run.sh @@ -385,6 +385,10 @@ print_info "日志输出到: $LOG_FILE" echo "" # 启动服务器并实时显示日志 +touch "$LOG_FILE" # 预创建文件,避免 tail -f 在文件尚未写入时报错 nohup python -m twinkle.server --config "$SERVER_CONFIG_FILE" > "$LOG_FILE" 2>&1 & SERVER_PID=$! print_success "Twinkle Server 已启动 (PID: $SERVER_PID)" + +# 实时显示日志(阻塞进程) +tail -f "$LOG_FILE" diff --git a/src/twinkle/model/megatron/megatron.py b/src/twinkle/model/megatron/megatron.py index f087d3a6..4bea32b7 100644 --- a/src/twinkle/model/megatron/megatron.py +++ b/src/twinkle/model/megatron/megatron.py @@ -1327,6 +1327,7 @@ def set_template(self, template_cls: Union[Template, Type[Template], str], **kwa """ adapter_name = kwargs.pop('adapter_name', self._get_default_group()) optimizer_config = self.optimizer_group[adapter_name] + kwargs['model_id'] = self.tokenizer_id optimizer_config.template = construct_class(template_cls, Template, twinkle.template, **kwargs) @remote_function(dispatch='all') From 85dbe59c97775393990ae84a80cb2cf8cdf107ea Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Thu, 16 Apr 2026 17:36:28 +0800 Subject: [PATCH 2/6] fix model id --- src/twinkle/server/utils/template_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/twinkle/server/utils/template_utils.py b/src/twinkle/server/utils/template_utils.py index ad015175..c593a8c0 100644 --- a/src/twinkle/server/utils/template_utils.py +++ b/src/twinkle/server/utils/template_utils.py @@ -10,6 +10,7 @@ # Key: model name pattern to match, Value: template name MODEL_TEMPLATE_MAPPING = { 'Qwen3.5': 'Qwen3_5Template', + 'Qwen3.6': 'Qwen3_5Template', # Add more model-template mappings here as needed # 'ModelName': 'TemplateName', } From b756f7efe075f0a35f4d6353008000d981aba1ba Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 17 Apr 2026 11:26:06 +0800 Subject: [PATCH 3/6] update upload --- Dockerfile | 3 +- client_tools/client_generator.py | 31 +++++++++- .../client/server/megatron/server_config.yaml | 2 +- .../client/tinker/self_host/upload_to_hub.py | 62 +++++++++++++++++++ .../client/twinkle/self_host/upload_to_hub.py | 61 ++++++++++++++++++ src/twinkle/server/model/twinkle_handlers.py | 37 +++++++++-- src/twinkle_client/dataset/base.py | 13 ++++ src/twinkle_client/dataset/lazy_dataset.py | 62 ++++++++++++++++++- .../model/multi_lora_transformers.py | 31 +++++++++- src/twinkle_client/sampler/vllm_sampler.py | 4 +- src/twinkle_client/types/__init__.py | 1 + src/twinkle_client/types/model.py | 24 +++++-- 12 files changed, 309 insertions(+), 22 deletions(-) create mode 100644 cookbook/client/tinker/self_host/upload_to_hub.py create mode 100644 cookbook/client/twinkle/self_host/upload_to_hub.py diff --git a/Dockerfile b/Dockerfile index 8107ebcb..5d6f7feb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,8 @@ RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") & CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir -RUN pip install megatron_core mcore_bridge --no-cache-dir +# TODO: mcore_bridge use main branch version +RUN pip install megatron_core git+https://github.com/modelscope/mcore-bridge.git --no-cache-dir # Install flash-attention (default arch 8.0;9.0, override via build-arg if needed) ARG TORCH_CUDA_ARCH_LIST="8.0;9.0" diff --git a/client_tools/client_generator.py b/client_tools/client_generator.py index 8927315d..15bfead6 100644 --- a/client_tools/client_generator.py +++ b/client_tools/client_generator.py @@ -438,7 +438,8 @@ def generate_models(): client_module_path.mkdir(parents=True, exist_ok=True) model_code = AUTO_GEN_WARNING + '''from typing import Any, Dict, Optional -from twinkle_client.http import http_post +import time +from twinkle_client.http import http_get, http_post from twinkle_client.types.model import ( CalculateLossResponse, CalculateMetricResponse, @@ -673,14 +674,20 @@ def upload_to_hub( hub_model_id: str, hub_token: Optional[str] = None, async_upload: bool = True, + poll_interval: float = 5.0, ) -> None: """Upload model checkpoint to hub. + Submits the upload task to the server and polls for completion. + Blocks until the upload finishes or raises on failure. + Args: checkpoint_dir: The directory path of the checkpoint to upload. hub_model_id: The hub model id. hub_token: The hub token (optional). - async_upload: Whether to use async upload (default: True). + async_upload: Deprecated, has no effect. The server always runs the + upload in the background and the client polls for completion. + poll_interval: Seconds between status poll requests (default: 5). """ response = http_post( url=f'{self.server_url}/upload_to_hub', @@ -688,10 +695,28 @@ def upload_to_hub( 'checkpoint_dir': checkpoint_dir, 'hub_model_id': hub_model_id, 'hub_token': hub_token, - 'async_upload': async_upload, } ) response.raise_for_status() + request_id = response.json().get('request_id') + if not request_id: + return + + print(f'[upload_to_hub] Upload started (task {request_id}), waiting for completion...') + while True: + status_resp = http_get(url=f'{self.server_url}/upload_status/{request_id}') + status_resp.raise_for_status() + data = status_resp.json() + status = data.get('status', 'unknown') + if status == 'completed': + print(f'[upload_to_hub] Upload completed successfully.') + return + elif status == 'failed': + error = data.get('error', 'Unknown error') + raise RuntimeError(f'[upload_to_hub] Upload failed: {error}') + else: + print(f'[upload_to_hub] Status: {status}...') + time.sleep(poll_interval) ''' # Write the model client file diff --git a/cookbook/client/server/megatron/server_config.yaml b/cookbook/client/server/megatron/server_config.yaml index abedb10a..123cad7f 100644 --- a/cookbook/client/server/megatron/server_config.yaml +++ b/cookbook/client/server/megatron/server_config.yaml @@ -87,7 +87,7 @@ applications: use_megatron: true # Use Megatron-LM backend model_id: "ms://Qwen/Qwen3.6-35B-A3B" # ModelScope model identifier max_length: 32000 # model max length - max_loras: 5 # model max loras + max_loras: 3 # model max loras nproc_per_node: 4 # Number of GPU processes per node device_group: name: model diff --git a/cookbook/client/tinker/self_host/upload_to_hub.py b/cookbook/client/tinker/self_host/upload_to_hub.py new file mode 100644 index 00000000..18088e56 --- /dev/null +++ b/cookbook/client/tinker/self_host/upload_to_hub.py @@ -0,0 +1,62 @@ +# Tinker-Compatible Client - Upload Checkpoint to Hub Example +# +# This script demonstrates how to upload a Tinker checkpoint to ModelScope Hub. +# Tinker checkpoints use the same twinkle:// path format as Twinkle checkpoints, +# so the upload is handled identically via the Twinkle upload interface. +# +# How it works: +# 1. The server submits the upload as a background task and returns a +# request_id immediately, so the HTTP call never times out. +# 2. The client polls /upload_status/{request_id} every few seconds and +# blocks until the upload completes or raises on failure. +# +# Prerequisites: +# - Server must be running (see server.py / server_config.yaml) +# - A ModelScope API token with write access to the target repository + +import dotenv + +dotenv.load_dotenv('.env') + +from twinkle import get_logger, init_twinkle_client +from twinkle_client.model import MultiLoraTransformersModel + +logger = get_logger() + +# ── Configuration ───────────────────────────────────────────────────────────── +base_model = 'Qwen/Qwen3.5-4B' +base_url = 'http://localhost:8000' +api_key = 'EMPTY_TOKEN' # token used for model training / server access + +# Checkpoint to upload: the twinkle:// path returned by training_client.save_state(), +# e.g. 'twinkle://20260301_142318-Qwen_Qwen3-4B-199d2cdb/weights/my-lora-epoch-0' +tinker_path = 'twinkle://REPLACE_ME/weights/REPLACE_ME' + +# ModelScope destination (must belong to your account) +hub_model_id = 'your_username/your-model-name' +hub_token = None # Set to your ModelScope API token, or None to use server default +# ── End of configuration ────────────────────────────────────────────────────── + + +def upload(): + # Step 1: Initialize the Twinkle client. + # Tinker checkpoints (twinkle:// paths) are resolved by the same checkpoint + # manager on the server, so init_twinkle_client is sufficient for upload. + init_twinkle_client(base_url=base_url, api_key=api_key) + + # Step 2: Create the model client (no training state needed for upload) + model = MultiLoraTransformersModel(model_id=f'ms://{base_model}') + + # Step 3: Upload checkpoint to ModelScope Hub. + # The client polls for completion automatically; progress is printed to stdout. + logger.info(f'Uploading {tinker_path!r} → {hub_model_id!r} ...') + model.upload_to_hub( + checkpoint_dir=tinker_path, + hub_model_id=hub_model_id, + hub_token=hub_token, + ) + logger.info(f'Upload complete: https://modelscope.cn/models/{hub_model_id}') + + +if __name__ == '__main__': + upload() diff --git a/cookbook/client/twinkle/self_host/upload_to_hub.py b/cookbook/client/twinkle/self_host/upload_to_hub.py new file mode 100644 index 00000000..0505d27d --- /dev/null +++ b/cookbook/client/twinkle/self_host/upload_to_hub.py @@ -0,0 +1,61 @@ +# Twinkle Client - Upload Checkpoint to Hub Example +# +# This script demonstrates how to upload a saved checkpoint to ModelScope Hub +# using the Twinkle client. No training is required: any existing checkpoint +# (obtained from a previous run via model.save()) can be uploaded directly. +# +# How it works: +# 1. The server submits the upload as a background task and returns a +# request_id immediately, so the HTTP call never times out. +# 2. The client polls /upload_status/{request_id} every few seconds and +# blocks until the upload completes or raises on failure. +# +# Prerequisites: +# - Server must be running (see server.py / server_config.yaml) +# - A ModelScope API token with write access to the target repository + +import dotenv + +dotenv.load_dotenv('.env') + +from twinkle import get_logger, init_twinkle_client +from twinkle_client.model import MultiLoraTransformersModel + +logger = get_logger() + +# ── Configuration ───────────────────────────────────────────────────────────── +base_model = 'Qwen/Qwen3.5-4B' +base_url = 'http://localhost:8000' +api_key = 'EMPTY_TOKEN' # token used for model training / server access + +# Checkpoint to upload: either a twinkle:// path or a local directory path. +# Example twinkle:// path (from model.save()): +# 'twinkle://20260410_131831-Qwen_Qwen3_5-4B-85279a20/weights/my-checkpoint' +twinkle_path = 'twinkle://REPLACE_ME/weights/REPLACE_ME' + +# ModelScope destination (must belong to your account) +hub_model_id = 'your_username/your-model-name' +hub_token = None # Set to your ModelScope API token, or None to use server default +# ── End of configuration ────────────────────────────────────────────────────── + + +def upload(): + # Step 1: Initialize the Twinkle client + init_twinkle_client(base_url=base_url, api_key=api_key) + + # Step 2: Create the model client (no training state needed for upload) + model = MultiLoraTransformersModel(model_id=f'ms://{base_model}') + + # Step 3: Upload checkpoint to ModelScope Hub. + # The client polls for completion automatically; progress is printed to stdout. + logger.info(f'Uploading {twinkle_path!r} → {hub_model_id!r} ...') + model.upload_to_hub( + checkpoint_dir=twinkle_path, + hub_model_id=hub_model_id, + hub_token=hub_token, + ) + logger.info(f'Upload complete: https://modelscope.cn/models/{hub_model_id}') + + +if __name__ == '__main__': + upload() diff --git a/src/twinkle/server/model/twinkle_handlers.py b/src/twinkle/server/model/twinkle_handlers.py index 535a3bd7..ed2e62b9 100644 --- a/src/twinkle/server/model/twinkle_handlers.py +++ b/src/twinkle/server/model/twinkle_handlers.py @@ -8,6 +8,7 @@ """ from __future__ import annotations +import asyncio import torch import traceback from fastapi import Depends, FastAPI, HTTPException, Request @@ -347,12 +348,12 @@ async def _task(): await run_task(self.schedule_task_and_wait(_task, task_type='load')) - @app.post('/twinkle/upload_to_hub') + @app.post('/twinkle/upload_to_hub', response_model=types.UploadToHubResponse) async def upload_to_hub( request: Request, body: types.UploadToHubRequest, self: ModelManagement = Depends(self_fn), - ) -> None: + ) -> types.UploadToHubResponse: token = await self._on_request_start(request) async def _task(): @@ -370,13 +371,39 @@ async def _task(): checkpoint_manager.get_ckpt_dir(model_id=model_id_to_load, checkpoint_id=checkpoint_id)) else: checkpoint_dir = body.checkpoint_dir - self.model.upload_to_hub( + # Run blocking upload in thread pool so the event loop is not blocked. + # async_upload is intentionally ignored here: the task queue + client polling + # already provide the fire-and-forget / wait semantics without holding the + # HTTP connection open for the full duration of the upload. + await asyncio.to_thread( + self.model.upload_to_hub, checkpoint_dir=checkpoint_dir, hub_model_id=body.hub_model_id, hub_token=body.hub_token or token, - async_upload=body.async_upload) + async_upload=False, + ) + + future_ref = await self.schedule_task(_task, task_type='upload_to_hub') + request_id = future_ref.get('request_id') + if request_id is None: + raise HTTPException(status_code=500, detail=f'Upload task scheduling failed: {future_ref}') + return types.UploadToHubResponse(request_id=request_id) - await run_task(self.schedule_task_and_wait(_task, task_type='upload_to_hub')) + @app.get('/twinkle/upload_status/{request_id}', response_model=types.UploadStatusResponse) + async def upload_status( + request: Request, + request_id: str, + self: ModelManagement = Depends(self_fn), + ) -> types.UploadStatusResponse: + await self._on_request_start(request) + record = await self.state.get_future(request_id) + if record is None: + raise HTTPException(status_code=404, detail=f'Upload task not found: {request_id}') + status = record.get('status', 'unknown') + error = None + if status == 'failed': + error = record.get('result', {}).get('error', 'Unknown error') + return types.UploadStatusResponse(request_id=request_id, status=status, error=error) @app.post('/twinkle/add_adapter_to_model', response_model=types.AddAdapterResponse) async def add_adapter_to_model( diff --git a/src/twinkle_client/dataset/base.py b/src/twinkle_client/dataset/base.py index 0487f733..c9d78ccc 100644 --- a/src/twinkle_client/dataset/base.py +++ b/src/twinkle_client/dataset/base.py @@ -78,6 +78,19 @@ def check(self, **kwargs): return response.json()["result"] + def cast_column(self, column: str, decode: bool = True): + response = http_post( + url=f'{self.server_url}/call', + json_data={ + 'processor_id': self.processor_id, + 'function': 'cast_column', + **{'column': column, 'decode': decode}, + } + ) + response.raise_for_status() + return response.json()["result"] + + def map(self, preprocess_func: Union[Preprocessor, Callable, str, Type[Preprocessor]], dataset_meta: DatasetMeta = None, init_args: Dict[str, Any] = None, **kwargs): response = http_post( url=f'{self.server_url}/call', diff --git a/src/twinkle_client/dataset/lazy_dataset.py b/src/twinkle_client/dataset/lazy_dataset.py index 62b13dea..bb3b204e 100644 --- a/src/twinkle_client/dataset/lazy_dataset.py +++ b/src/twinkle_client/dataset/lazy_dataset.py @@ -9,9 +9,12 @@ # 2. Run: python client_tools/client_generator.py # ============================================================================ +from typing import Any, Callable, Dict, Type, Union from twinkle_client.http import http_post from twinkle.dataset import Dataset from twinkle.dataset import DatasetMeta +from twinkle.preprocessor import DataFilter +from twinkle.preprocessor import Preprocessor from .base import Dataset class LazyDataset(Dataset): @@ -33,13 +36,68 @@ def __init__(self, dataset_meta: DatasetMeta, **kwargs): self.processor_id = response.json()['processor_id'] - def encode(self, **kwargs): + def map(self, preprocess_func: Union[Preprocessor, Callable, str, Type[Preprocessor]], dataset_meta: DatasetMeta = None, init_args: Dict[str, Any] = None, **kwargs): + response = http_post( + url=f'{self.server_url}/call', + json_data={ + 'processor_id': self.processor_id, + 'function': 'map', + **{'preprocess_func': preprocess_func, 'dataset_meta': dataset_meta, 'init_args': init_args}, + **kwargs + } + ) + response.raise_for_status() + return response.json()["result"] + + + def filter(self, filter_func: Union[Callable, str, Type[DataFilter], DataFilter], dataset_meta: DatasetMeta = None, init_args: Dict[str, Any] = None, **kwargs): + response = http_post( + url=f'{self.server_url}/call', + json_data={ + 'processor_id': self.processor_id, + 'function': 'filter', + **{'filter_func': filter_func, 'dataset_meta': dataset_meta, 'init_args': init_args}, + **kwargs + } + ) + response.raise_for_status() + return response.json()["result"] + + + def add_dataset(self, dataset_meta: DatasetMeta, **kwargs): + response = http_post( + url=f'{self.server_url}/call', + json_data={ + 'processor_id': self.processor_id, + 'function': 'add_dataset', + **{'dataset_meta': dataset_meta}, + **kwargs + } + ) + response.raise_for_status() + return response.json()["result"] + + + def mix_dataset(self, interleave = True): + response = http_post( + url=f'{self.server_url}/call', + json_data={ + 'processor_id': self.processor_id, + 'function': 'mix_dataset', + **{'interleave': interleave}, + } + ) + response.raise_for_status() + return response.json()["result"] + + + def encode(self, add_generation_prompt: bool = False, **kwargs): response = http_post( url=f'{self.server_url}/call', json_data={ 'processor_id': self.processor_id, 'function': 'encode', - **{}, + **{'add_generation_prompt': add_generation_prompt}, **kwargs } ) diff --git a/src/twinkle_client/model/multi_lora_transformers.py b/src/twinkle_client/model/multi_lora_transformers.py index 743125d9..c2509f42 100644 --- a/src/twinkle_client/model/multi_lora_transformers.py +++ b/src/twinkle_client/model/multi_lora_transformers.py @@ -9,7 +9,8 @@ # 2. Run: python client_tools/client_generator.py # ============================================================================ from typing import Any, Dict, Optional -from twinkle_client.http import http_post +import time +from twinkle_client.http import http_get, http_post from twinkle_client.types.model import ( CalculateLossResponse, CalculateMetricResponse, @@ -244,14 +245,20 @@ def upload_to_hub( hub_model_id: str, hub_token: Optional[str] = None, async_upload: bool = True, + poll_interval: float = 5.0, ) -> None: """Upload model checkpoint to hub. + Submits the upload task to the server and polls for completion. + Blocks until the upload finishes or raises on failure. + Args: checkpoint_dir: The directory path of the checkpoint to upload. hub_model_id: The hub model id. hub_token: The hub token (optional). - async_upload: Whether to use async upload (default: True). + async_upload: Deprecated, has no effect. The server always runs the + upload in the background and the client polls for completion. + poll_interval: Seconds between status poll requests (default: 5). """ response = http_post( url=f'{self.server_url}/upload_to_hub', @@ -259,7 +266,25 @@ def upload_to_hub( 'checkpoint_dir': checkpoint_dir, 'hub_model_id': hub_model_id, 'hub_token': hub_token, - 'async_upload': async_upload, } ) response.raise_for_status() + request_id = response.json().get('request_id') + if not request_id: + return + + print(f'[upload_to_hub] Upload started (task {request_id}), waiting for completion...') + while True: + status_resp = http_get(url=f'{self.server_url}/upload_status/{request_id}') + status_resp.raise_for_status() + data = status_resp.json() + status = data.get('status', 'unknown') + if status == 'completed': + print(f'[upload_to_hub] Upload completed successfully.') + return + elif status == 'failed': + error = data.get('error', 'Unknown error') + raise RuntimeError(f'[upload_to_hub] Upload failed: {error}') + else: + print(f'[upload_to_hub] Status: {status}...') + time.sleep(poll_interval) diff --git a/src/twinkle_client/sampler/vllm_sampler.py b/src/twinkle_client/sampler/vllm_sampler.py index 7839f557..a361b1e7 100644 --- a/src/twinkle_client/sampler/vllm_sampler.py +++ b/src/twinkle_client/sampler/vllm_sampler.py @@ -68,7 +68,7 @@ def sample( num_samples: Number of completions to generate per prompt. Returns: - A list of sampleResponseModel with 'sequences' list, each containing tokens, logprobs, stop_reason. + SampleResponseModel with 'sequences' list, each containing tokens, logprobs, stop_reason. """ json_data = { 'inputs': inputs, @@ -94,7 +94,7 @@ def set_template(self, template_cls: str, adapter_name: str = '', **kwargs) -> S ) response.raise_for_status() return SetTemplateResponse(**response.json()) - + def apply_patch(self, patch_cls: str, **kwargs) -> None: """Apply a patch to the model.""" response = http_post( diff --git a/src/twinkle_client/types/__init__.py b/src/twinkle_client/types/__init__.py index 00b1f967..51541dba 100644 --- a/src/twinkle_client/types/__init__.py +++ b/src/twinkle_client/types/__init__.py @@ -43,6 +43,7 @@ StepResponse, UploadToHubRequest, UploadToHubResponse, + UploadStatusResponse, ZeroGradResponse, ) from .processor import ( diff --git a/src/twinkle_client/types/model.py b/src/twinkle_client/types/model.py index e594bae4..2474c9e9 100644 --- a/src/twinkle_client/types/model.py +++ b/src/twinkle_client/types/model.py @@ -4,8 +4,8 @@ These models are used by both the server-side handler and the twinkle client. """ -from pydantic import BaseModel -from typing import Any, Dict, List, Optional +from pydantic import BaseModel, field_validator +from typing import Any, Dict, List, Optional, Union class CreateRequest(BaseModel): @@ -71,11 +71,18 @@ class Config: class UploadToHubRequest(BaseModel): - checkpoint_dir: str + checkpoint_dir: Union[str, Dict] hub_model_id: str hub_token: Optional[str] = None async_upload: bool = False + @field_validator('checkpoint_dir', mode='before') + @classmethod + def extract_checkpoint_dir(cls, v): + if isinstance(v, dict): + return v['twinkle_path'] + return v + class Config: extra = 'allow' @@ -264,9 +271,16 @@ class SetProcessorResponse(OkResponse): pass -class UploadToHubResponse(OkResponse): +class UploadToHubResponse(BaseModel): """Response for /upload_to_hub endpoint.""" - pass + request_id: str + + +class UploadStatusResponse(BaseModel): + """Response for /upload_status/{request_id} endpoint.""" + request_id: str + status: str # pending / queued / running / completed / failed + error: Optional[str] = None class ClipGradAndStepResponse(OkResponse): From d7b98be2a1362f906cc0408afc5bb753f4a70041 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 17 Apr 2026 11:31:08 +0800 Subject: [PATCH 4/6] update upload --- Dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5d6f7feb..8107ebcb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,8 +22,7 @@ RUN SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") & CPLUS_INCLUDE_PATH=$SITE_PACKAGES/nvidia/cudnn/include \ pip install --no-build-isolation "transformer_engine[pytorch]" --no-cache-dir -# TODO: mcore_bridge use main branch version -RUN pip install megatron_core git+https://github.com/modelscope/mcore-bridge.git --no-cache-dir +RUN pip install megatron_core mcore_bridge --no-cache-dir # Install flash-attention (default arch 8.0;9.0, override via build-arg if needed) ARG TORCH_CUDA_ARCH_LIST="8.0;9.0" From 1640e3e4dd3e72ced53847d8c2183c0b666d4406 Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 17 Apr 2026 12:28:08 +0800 Subject: [PATCH 5/6] update upload --- src/twinkle/hub/hub.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/twinkle/hub/hub.py b/src/twinkle/hub/hub.py index 15fc1ef5..08f374de 100644 --- a/src/twinkle/hub/hub.py +++ b/src/twinkle/hub/hub.py @@ -374,17 +374,26 @@ def push_to_hub(cls, ignore_patterns = [] if revision is None or revision == 'main': revision = 'master' - result = push_to_hub( - repo_id, - folder_path, - token or cls.ms_token, - private, - commit_message=commit_message, - ignore_file_pattern=ignore_patterns, - revision=revision, - tag=path_in_repo) + try: + result = push_to_hub( + repo_id, + folder_path, + token or cls.ms_token, + private, + commit_message=commit_message, + ignore_file_pattern=ignore_patterns, + revision=revision, + tag=path_in_repo) + except Exception as exc: + raise RuntimeError( + f'ModelScope push_to_hub raised an exception ' + f'(repo_id={repo_id!r}, folder_path={folder_path!r}): {exc}') from exc if not result: - raise Exception('Failed to push to hub') + raise RuntimeError( + f'ModelScope push_to_hub returned a falsy result ' + f'(repo_id={repo_id!r}, folder_path={folder_path!r}). ' + f'This usually indicates an invalid/expired token or insufficient write permission.' + ) @classmethod def load_dataset(cls, From c427370f81b37da5fcae6d267437b5562b186feb Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Fri, 17 Apr 2026 12:29:12 +0800 Subject: [PATCH 6/6] update upload --- src/twinkle/hub/hub.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/twinkle/hub/hub.py b/src/twinkle/hub/hub.py index 08f374de..5ee00cd0 100644 --- a/src/twinkle/hub/hub.py +++ b/src/twinkle/hub/hub.py @@ -385,15 +385,12 @@ def push_to_hub(cls, revision=revision, tag=path_in_repo) except Exception as exc: - raise RuntimeError( - f'ModelScope push_to_hub raised an exception ' - f'(repo_id={repo_id!r}, folder_path={folder_path!r}): {exc}') from exc + raise RuntimeError(f'ModelScope push_to_hub raised an exception ' + f'(repo_id={repo_id!r}, folder_path={folder_path!r}): {exc}') from exc if not result: - raise RuntimeError( - f'ModelScope push_to_hub returned a falsy result ' - f'(repo_id={repo_id!r}, folder_path={folder_path!r}). ' - f'This usually indicates an invalid/expired token or insufficient write permission.' - ) + raise RuntimeError(f'ModelScope push_to_hub returned a falsy result ' + f'(repo_id={repo_id!r}, folder_path={folder_path!r}). ' + f'This usually indicates an invalid/expired token or insufficient write permission.') @classmethod def load_dataset(cls,