Changes from all commits (66 commits)
5cd3c0f
docs: add transformers resume design spec
kevssim Mar 27, 2026
91eeaeb
docs: refine transformers resume design spec
kevssim Mar 27, 2026
6eebda8
docs: trim resume state fields
kevssim Mar 27, 2026
cdd9c1b
docs: add npu resume compatibility requirements
kevssim Mar 27, 2026
1542492
chore: ignore local worktrees
kevssim Mar 27, 2026
9883118
wip
kevssim Mar 30, 2026
d41a634
wip
kevssim Mar 30, 2026
21f9918
wip
kevssim Mar 30, 2026
1e59531
fix
kevssim Mar 30, 2026
9bb3f39
wip
kevssim Mar 30, 2026
fdf1f71
fix
kevssim Mar 30, 2026
6cf5160
wip
kevssim Mar 30, 2026
144ffe6
Merge branch 'modelscope:main' into resume_from_ckpt
kevssim Mar 31, 2026
e21f870
lint
kevssim Mar 31, 2026
3359209
Merge branch 'resume_from_ckpt' of https://github.com/kevssim/twinkle…
kevssim Mar 31, 2026
70ebe50
wip
kevssim Mar 31, 2026
483778d
wip
kevssim Mar 31, 2026
039789b
wip
kevssim Mar 31, 2026
54de1a4
wip
kevssim Mar 31, 2026
920ab86
wip
kevssim Mar 31, 2026
ffd6304
lint
kevssim Mar 31, 2026
582bd41
wip
kevssim Mar 31, 2026
9cb6106
wip
kevssim Apr 1, 2026
c0cf72e
wip
kevssim Apr 1, 2026
505a75c
wip
kevssim Apr 1, 2026
a222b5b
fix
kevssim Apr 1, 2026
7499e00
wip
kevssim Apr 1, 2026
cd0b094
doc
kevssim Apr 1, 2026
abf2c2f
wip
kevssim Apr 1, 2026
8bf7a6a
lint
kevssim Apr 1, 2026
27e76c6
Merge remote-tracking branch 'origin/main' into resume_from_ckpt
kevssim Apr 2, 2026
5d68910
Merge remote-tracking branch 'origin' into resume_from_ckpt
kevssim Apr 16, 2026
9326e64
wip
kevssim Apr 16, 2026
670f0c1
feat: add resume_from_checkpoint abstract method to TwinkleModel base
kevssim Apr 21, 2026
784730c
feat(dataloader): add resume_from_checkpoint wrapping skip_consumed_s…
kevssim Apr 21, 2026
3db38e9
feat(transformers): replace load_training_state/read_training_progres…
kevssim Apr 21, 2026
94679d5
feat(megatron): add resume_from_checkpoint and save trainer_state.json
kevssim Apr 21, 2026
832ce87
refactor(cookbook): use model.resume_from_checkpoint API
kevssim Apr 21, 2026
e3a3cd6
feat(types): replace training state request types with ResumeFromChec…
kevssim Apr 21, 2026
a3effab
feat(server): replace training state endpoints with /resume_from_chec…
kevssim Apr 21, 2026
383336d
feat(client): replace training state methods with resume_from_checkpoint
kevssim Apr 21, 2026
54a1db6
docs: update checkpoint/resume documentation for unified API
kevssim Apr 21, 2026
597cbd9
fix: remove stale load_training_state references from __init__.py, mu…
kevssim Apr 21, 2026
c55ab9f
fix(transformers): pass correct file paths to _load_scaler_state and …
kevssim Apr 21, 2026
8f76b7b
fix: guard rng_state.pt existence check, add Config extra=allow to Re…
kevssim Apr 21, 2026
4ffa5c7
wip
kevssim Apr 21, 2026
0b43055
wip
kevssim Apr 21, 2026
c8bc9ab
wip
kevssim Apr 21, 2026
8c0399e
wip
kevssim Apr 21, 2026
94af275
Merge remote-tracking branch 'origin/main' into resume_from_ckpt
kevssim Apr 21, 2026
10b4a20
refactor: delete resume_utils.py, inline logic in fsdp2.py, update docs
kevssim Apr 21, 2026
3df191a
wip
kevssim Apr 23, 2026
deeb648
Merge remote-tracking branch 'origin/main' into resume_from_ckpt
kevssim Apr 24, 2026
7a657e8
wip
kevssim Apr 27, 2026
ae67122
fix
kevssim Apr 27, 2026
5b15d67
lint
kevssim Apr 27, 2026
f0d36e2
remove
kevssim Apr 27, 2026
d0219df
wip
kevssim Apr 28, 2026
9d5327d
update
kevssim Apr 28, 2026
85b7cf8
doc
kevssim Apr 28, 2026
9af73bc
fix
kevssim Apr 29, 2026
482a451
fix doc
kevssim Apr 30, 2026
2396419
fix
kevssim Apr 30, 2026
9a6fbb9
lint
kevssim Apr 30, 2026
daa9202
update cookbook
kevssim Apr 30, 2026
a75f8b1
fix
kevssim Apr 30, 2026
10 changes: 10 additions & 0 deletions client_tools/client_generator.py
@@ -449,6 +449,7 @@ def generate_models():
GetStateDictResponse,
GetTrainConfigsResponse,
SaveResponse,
TrainingProgressResponse,
)
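
Note: TrainingProgressResponse itself is not part of this diff. Judging only from its use below (TrainingProgressResponse(**response.json()).result), a plausible shape is the following sketch — an assumption, not the actual definition:

from typing import Any, Dict
from pydantic import BaseModel

class TrainingProgressResponse(BaseModel):
    # Assumed: the server wraps the progress dict in a `result` field,
    # e.g. {'cur_step': 120, 'consumed_train_samples': 1920}.
    result: Dict[str, Any]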


@@ -618,6 +619,15 @@ def load(self, name: str, **kwargs) -> None:
)
response.raise_for_status()

def resume_from_checkpoint(self, name: str, *, resume_only_model: bool = False, **kwargs) -> Dict[str, Any]:
response = http_post(
url=f'{self.server_url}/resume_from_checkpoint',
json_data={'name': name, 'adapter_name': self.adapter_name,
'resume_only_model': resume_only_model, **kwargs}
)
response.raise_for_status()
return TrainingProgressResponse(**response.json()).result

def apply_patch(self, patch_cls: str, **kwargs) -> None:
"""Apply a patch to the model."""
response = http_post(
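
Note: for reference, the generated method would be called roughly like this (a minimal sketch; the client class name and server URL are placeholders, and the checkpoint name is whatever was previously passed to save()):

client = ModelClient(server_url='http://localhost:8000')  # hypothetical class name
progress = client.resume_from_checkpoint(
    'twinkle-epoch-0',        # checkpoint name used at save time
    resume_only_model=False,  # False restores optimizer/scheduler state as well
)
# `progress` is TrainingProgressResponse.result; the cookbook scripts read:
start_step = progress['cur_step']
consumed_samples = progress['consumed_train_samples']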
19 changes: 13 additions & 6 deletions cookbook/client/twinkle/self_host/self_cognition.py
@@ -99,16 +99,19 @@ def train():
# model.set_lr_scheduler('LinearLR')

# Step 6: Optionally resume from a previous checkpoint
start_step = 0
if resume_path:
logger.info(f'Resuming training from {resume_path}')
model.load(resume_path, load_optimizer=True)
logger.info(f'Resuming from checkpoint {resume_path}')
progress = model.resume_from_checkpoint(resume_path)
dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
start_step = progress['cur_step']

# Step 7: Run the training loop
logger.info(model.get_train_configs().model_dump())

for epoch in range(3):
logger.info(f'Starting epoch {epoch}')
for step, batch in enumerate(dataloader):
for cur_step, batch in enumerate(dataloader, start=start_step + 1):
# Forward pass + backward pass (computes gradients)
model.forward_backward(inputs=batch)

@@ -125,13 +128,17 @@ def train():
# model.lr_step()

# Log the loss every 2 steps (aligned with gradient accumulation)
if step % 2 == 0:
if cur_step % 2 == 0:
# Print metric
metric = model.calculate_metric(is_training=True)
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric.result}')
logger.info(f'Current is step {cur_step} of {len(dataloader)}, metric: {metric.result}')

# Step 8: Save the trained checkpoint
twinkle_path = model.save(name=f'twinkle-epoch-{epoch}', save_optimizer=True)
twinkle_path = model.save(
name=f'twinkle-epoch-{epoch}',
save_optimizer=True,
consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
)
logger.info(f'Saved checkpoint: {twinkle_path}')

# Step 9: Upload the checkpoint to ModelScope Hub
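
Note: taken together, the save/resume round trip wired up above reduces to this flow (a condensed sketch of the cookbook code, not a new API; it assumes the value returned by save() is the same identifier resume_from_checkpoint() accepts):

# First run: persist progress alongside the weights.
twinkle_path = model.save(
    name='twinkle-epoch-0',
    save_optimizer=True,
    consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
)
# Later run: restore state, then fast-forward the data stream.
progress = model.resume_from_checkpoint(twinkle_path)
dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
start_step = progress['cur_step']  # the loop resumes at start_step + 1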
126 changes: 79 additions & 47 deletions cookbook/megatron/tp.py
@@ -1,81 +1,113 @@
import os
from pathlib import Path

from peft import LoraConfig
from tqdm import tqdm

import twinkle
from twinkle import DeviceMesh, Platform, get_device_placement, get_logger
from twinkle import DeviceMesh, get_device_placement, get_logger
from twinkle.dataloader import DataLoader
from twinkle.dataset import Dataset, DatasetMeta
from twinkle.model import MegatronModel
from twinkle.preprocessor import SelfCognitionProcessor
# Construct a device_mesh, tp=pp=dp=2
device_mesh = DeviceMesh.from_sizes(dp_size=2, tp_size=2, pp_size=2)
# use torchrun mode
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()

MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
DATASET_ID = 'ms://swift/self-cognition'
TEMPLATE_NAME = 'Qwen3_5Template'
MODEL_NAME = 'twinkle大模型'
MODEL_AUTHOR = 'ModelScope社区'
DP_SIZE = 2
TP_SIZE = 2
PP_SIZE = 2
BATCH_SIZE = 16
LEARNING_RATE = 1e-4
LOG_INTERVAL = 5
EVAL_INTERVAL = 20
EVAL_SAMPLES = 100
TRAIN_SAMPLES = 1000

OUTPUT_DIR = './output/megatron_tp'
RESUME_FROM_CHECKPOINT = None
RESUME_ONLY_MODEL = False
IGNORE_DATA_SKIP = False
ADAPTER_NAME = 'default'

device_mesh = DeviceMesh.from_sizes(dp_size=DP_SIZE, tp_size=TP_SIZE, pp_size=PP_SIZE)
twinkle.initialize(mode='local', global_device_mesh=device_mesh)


def eval(model):
# 100 Samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
def build_dataset(num_samples: int) -> Dataset:
dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=range(num_samples)))
dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID)
dataset.map(SelfCognitionProcessor(MODEL_NAME, MODEL_AUTHOR))
dataset.encode()
dataloader = DataLoader(dataset=dataset, batch_size=16)
for step, batch in tqdm(enumerate(dataloader)):
return dataset


def save_checkpoint(model: MegatronModel, checkpoint_name: str, dataloader: DataLoader):
model.save(
checkpoint_name,
output_dir=OUTPUT_DIR,
adapter_name=ADAPTER_NAME,
save_optimizer=True,
consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
)


def evaluate(model):
dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
for batch in tqdm(dataloader):
model.forward_only(inputs=batch)
metrics = model.calculate_metric(is_training=False)
return metrics
return model.calculate_metric(is_training=False)


def train():
# 1000 samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
# Set template to prepare encoding
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
# Preprocess the dataset to standard format
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
# Encode dataset
dataset.encode()
# Global batch size = 16, dp_size = 2
dataloader = DataLoader(dataset=dataset, batch_size=16)
# Use a MegatronModel
model = MegatronModel(model_id='ms://Qwen/Qwen3.5-4B')
dataset = build_dataset(TRAIN_SAMPLES)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)

model = MegatronModel(model_id=MODEL_ID)

lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')

# Add a lora to model, with name `default`
# Comment this to use full-parameter training
model.add_adapter_to_model('default', lora_config)
# Add Optimizer for lora `default`
model.set_optimizer(optimizer_cls='default', lr=1e-4)
# Add LRScheduler for lora `default`
model.add_adapter_to_model(ADAPTER_NAME, lora_config)
model.set_optimizer(optimizer_cls='default', lr=LEARNING_RATE)
model.set_lr_scheduler(scheduler_cls='default', lr_warmup_steps=5, lr_decay_steps=len(dataloader))

start_step = 0
if RESUME_FROM_CHECKPOINT:
checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
kwargs = {}
if ADAPTER_NAME:
kwargs['adapter_name'] = ADAPTER_NAME
progress = model.resume_from_checkpoint(
str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
if not IGNORE_DATA_SKIP:
dataloader.resume_from_checkpoint(progress['consumed_train_samples'])
start_step = progress['cur_step']

logger.info(get_device_placement())
# Print the training config
logger.info(model.get_train_configs())
logger.info(f'Total steps: {len(dataloader)}')
loss_metric = 99.0
# lora: 10G * 8
# full: 40G * 8
for step, batch in enumerate(dataloader):
# Do forward and backward

best_loss = float('inf')

for step, batch in enumerate(dataloader, start=start_step):
model.forward_backward(inputs=batch)
# Step
model.clip_grad_and_step()
if step % 5 == 0:
# Print metric
if step % LOG_INTERVAL == 0:
metric = model.calculate_metric(is_training=True)
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
if step > 0 and step % 20 == 0:
metrics = eval(model)
if step > 0 and step % EVAL_INTERVAL == 0:
metrics = evaluate(model)
logger.info(f'Eval metric: {metrics}')
metrics['step'] = step
if loss_metric > float(metrics['loss']):
model.save(f'checkpoint-{step}')
loss_metric = float(metrics['loss'])
model.save(f'last-checkpoint')
current_loss = float(metrics['loss'])
if current_loss < best_loss:
save_checkpoint(model, f'checkpoint-{step}', dataloader)
best_loss = current_loss
save_checkpoint(model, 'last-checkpoint', dataloader)


if __name__ == '__main__':
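
Note: the three resume knobs introduced at the top of the script compose as follows (values are illustrative; the checkpoint path is an example, not one produced by this page):

# Full resume: restore weights, optimizer/scheduler state, and skip
# already-consumed samples in the dataloader.
RESUME_FROM_CHECKPOINT = './output/megatron_tp/checkpoint-40'  # example path
RESUME_ONLY_MODEL = False
IGNORE_DATA_SKIP = False

# Variations (per the branches in train() above):
#   RESUME_ONLY_MODEL = True  -> load weights only; optimizer/scheduler start fresh
#   IGNORE_DATA_SKIP = True   -> keep whatever was restored, but replay the
#                                dataset from the beginning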
115 changes: 78 additions & 37 deletions cookbook/transformers/fsdp2.py
@@ -1,3 +1,5 @@
from pathlib import Path

from peft import LoraConfig
from tqdm import tqdm

@@ -8,77 +10,116 @@
from twinkle.model import TransformersModel
from twinkle.preprocessor import SelfCognitionProcessor

# Construct a device_mesh, fsdp_size=2, dp=4
device_mesh = DeviceMesh.from_sizes(fsdp_size=2, dp_size=4)
logger = get_logger()

MODEL_ID = 'ms://Qwen/Qwen3.5-4B'
DATASET_ID = 'ms://swift/self-cognition'
TEMPLATE_NAME = 'Qwen3_5Template'
MODEL_NAME = 'twinkle大模型'
MODEL_AUTHOR = 'ModelScope社区'
FSDP_SIZE = 2
DP_SIZE = 4
BATCH_SIZE = 8
LEARNING_RATE = 1e-4
GRADIENT_ACCUMULATION_STEPS = 2
LOG_INTERVAL = 20
EVAL_INTERVAL = 40
EVAL_SAMPLES = 100
TRAIN_SAMPLES = 1000

OUTPUT_DIR = './output/fsdp2'
RESUME_FROM_CHECKPOINT = None
RESUME_ONLY_MODEL = False
IGNORE_DATA_SKIP = False
ADAPTER_NAME = 'default'

# Construct a device_mesh
device_mesh = DeviceMesh.from_sizes(fsdp_size=FSDP_SIZE, dp_size=DP_SIZE)
# use torchrun mode
twinkle.initialize(mode='local', global_device_mesh=device_mesh)

logger = get_logger()


def eval(model):
# 100 Samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(100)))
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
def build_dataset(num_samples: int) -> Dataset:
dataset = Dataset(dataset_meta=DatasetMeta(DATASET_ID, data_slice=range(num_samples)))
dataset.set_template(TEMPLATE_NAME, model_id=MODEL_ID)
dataset.map(SelfCognitionProcessor(MODEL_NAME, MODEL_AUTHOR))
dataset.encode()
dataloader = DataLoader(dataset=dataset, batch_size=8)
for step, batch in tqdm(enumerate(dataloader)):
return dataset


def save_checkpoint(model: TransformersModel, checkpoint_name: str, dataloader: DataLoader):
model.save(
checkpoint_name,
output_dir=OUTPUT_DIR,
adapter_name=ADAPTER_NAME,
save_optimizer=True,
consumed_train_samples=dataloader.get_state()['consumed_train_samples'],
)


def evaluate(model):
dataloader = DataLoader(dataset=build_dataset(EVAL_SAMPLES), batch_size=BATCH_SIZE)
for batch in tqdm(dataloader):
model.forward_only(inputs=batch)
model.calculate_loss()
metrics = model.calculate_metric(is_training=False)
return metrics
return model.calculate_metric(is_training=False)


def train():
# 1000 samples
dataset = Dataset(dataset_meta=DatasetMeta('ms://swift/self-cognition', data_slice=range(1000)))
# Set template to prepare encoding
dataset.set_template('Qwen3_5Template', model_id='ms://Qwen/Qwen3.5-4B')
# Preprocess the dataset to standard format
dataset.map(SelfCognitionProcessor('twinkle大模型', 'ModelScope社区'))
# Encode dataset
dataset.encode()
dataset = build_dataset(TRAIN_SAMPLES)
# Global batch size = 8 across 8 GPUs, so 1 sample per GPU
dataloader = DataLoader(dataset=dataset, batch_size=8)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE)
# Use a TransformersModel
model = TransformersModel(model_id='ms://Qwen/Qwen3.5-4B')
model = TransformersModel(model_id=MODEL_ID)
model.model._no_split_modules = {'Qwen3_5DecoderLayer'}

lora_config = LoraConfig(r=8, lora_alpha=32, target_modules='all-linear')

# Add a lora to model, with name `default`
# Comment this to use full-parameter training
model.add_adapter_to_model('default', lora_config, gradient_accumulation_steps=2)
model.add_adapter_to_model(ADAPTER_NAME, lora_config, gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS)
# Add Optimizer for lora `default`
model.set_optimizer(optimizer_cls='AdamW', lr=1e-4)
model.set_optimizer(optimizer_cls='AdamW', lr=LEARNING_RATE)
# Add LRScheduler for lora `default`
model.set_lr_scheduler(
scheduler_cls='CosineWarmupScheduler', num_warmup_steps=5, num_training_steps=len(dataloader))

if RESUME_FROM_CHECKPOINT:
checkpoint_path = Path(RESUME_FROM_CHECKPOINT).expanduser().resolve()
kwargs = {}
if ADAPTER_NAME:
kwargs['adapter_name'] = ADAPTER_NAME
progress = model.resume_from_checkpoint(
str(checkpoint_path), resume_only_model=RESUME_ONLY_MODEL, **kwargs)
if not IGNORE_DATA_SKIP:
dataloader.resume_from_checkpoint(progress['consumed_train_samples'])

logger.info(get_device_placement())
# Print the training config
logger.info(model.get_train_configs())
logger.info(f'Total steps: {len(dataloader)}')
loss_metric = 99.0
optimizer_group = model.optimizer_group[ADAPTER_NAME]
best_loss = float('inf')
# lora: 8G * 8
# full: 18G * 8
for step, batch in enumerate(dataloader):
for batch in dataloader:
# Do forward and backward
model.forward_backward(inputs=batch)
# Step
model.clip_grad_and_step()
if step % 20 == 0:
cur_step = optimizer_group.cur_step
if cur_step % LOG_INTERVAL == 0:
# Print metric
metric = model.calculate_metric(is_training=True)
logger.info(f'Current is step {step} of {len(dataloader)}, metric: {metric}')
if step > 0 and step % 40 == 0:
metrics = eval(model)
logger.info(f'Current is step {cur_step} of {len(dataloader)}, metric: {metric}')
if cur_step > 0 and cur_step % EVAL_INTERVAL == 0:
metrics = evaluate(model)
logger.info(f'Eval metric: {metrics}')
metrics['step'] = step
if loss_metric > float(metrics['loss']):
model.save(f'checkpoint-{step}')
loss_metric = float(metrics['loss'])
model.save(f'last-checkpoint')
metrics['step'] = cur_step
current_loss = float(metrics['loss'])
if current_loss < best_loss:
save_checkpoint(model, f'checkpoint-{cur_step}', dataloader)
best_loss = current_loss
save_checkpoint(model, 'last-checkpoint', dataloader)


if __name__ == '__main__':
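
Note: commit 94679d5 mentions that progress is persisted as trainer_state.json. Assuming it sits directly inside the checkpoint directory written by save() (a layout this page does not show), it can be inspected offline; the two fields below are the ones the cookbook scripts consume:

import json
from pathlib import Path

ckpt = Path('./output/fsdp2/last-checkpoint')  # example path from this script
state = json.loads((ckpt / 'trainer_state.json').read_text())
print(state['cur_step'], state['consumed_train_samples'])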