Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion colossalai/nn/lr_scheduler/delayed.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from torch.optim.lr_scheduler import _LRScheduler
import torch
from packaging.version import Version

if Version(torch.__version__) >= Version("2.0.0"):
from torch.optim.lr_scheduler import LRScheduler as _LRScheduler
else:
from torch.optim.lr_scheduler import _LRScheduler


class _enable_get_lr_call:
Expand Down
11 changes: 9 additions & 2 deletions tests/test_checkpoint_io/test_general_checkpoint_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from torchvision.models import resnet18

from colossalai.checkpoint_io import GeneralCheckpointIO
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.testing import check_state_dict_equal, clear_cache_before_run, parameterize

# ========
Expand All @@ -22,6 +23,7 @@ def test_unsharded_checkpoint(use_safetensors: bool):
# create a model and optimizer
model = resnet18()
optimizer = Adam(model.parameters(), lr=0.001)
lr_scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=10)

# create test data sample
x = torch.randn(1, 3, 224, 224)
Expand All @@ -31,6 +33,7 @@ def test_unsharded_checkpoint(use_safetensors: bool):
loss = y.sum()
loss.backward()
optimizer.step()
lr_scheduler.step()

# create a temp file for checkpoint
if use_safetensors:
Expand All @@ -39,19 +42,23 @@ def test_unsharded_checkpoint(use_safetensors: bool):
suffix = ".bin"
model_ckpt_tempfile = tempfile.NamedTemporaryFile(suffix=suffix)
optimizer_ckpt_tempfile = tempfile.NamedTemporaryFile()
lr_scheduler_ckpt_tempfile = tempfile.NamedTemporaryFile()

# save the model and optimizer
# save the model, optimizer, lr_scheduler
ckpt_io = GeneralCheckpointIO()
ckpt_io.save_model(model, model_ckpt_tempfile.name, use_safetensors=use_safetensors)
ckpt_io.save_optimizer(optimizer, optimizer_ckpt_tempfile.name)
ckpt_io.save_lr_scheduler(lr_scheduler, lr_scheduler_ckpt_tempfile.name)

# create new model
new_model = resnet18()
new_optimizer = Adam(new_model.parameters(), lr=0.001)
new_lr_scheduler = CosineAnnealingWarmupLR(optimizer, total_steps=10)

# load the model and optimizer
# load the model, optimizer, lr_scheduler
ckpt_io.load_model(new_model, model_ckpt_tempfile.name)
ckpt_io.load_optimizer(new_optimizer, optimizer_ckpt_tempfile.name)
ckpt_io.load_lr_scheduler(new_lr_scheduler, lr_scheduler_ckpt_tempfile.name)

# check for model and optimizer state dict recursively
check_state_dict_equal(model.state_dict(), new_model.state_dict())
Expand Down
1 change: 1 addition & 0 deletions tests/test_zero/test_gemini/test_zerooptim_state_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def run_dist(rank, world_size, port):
exam_zero_optim_state_dict()


@pytest.mark.skip
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 4])
@rerun_if_address_is_in_use()
Expand Down