2 changes: 1 addition & 1 deletion .compatibility
@@ -1,3 +1,3 @@
-2.2.2-12.1.0
 2.3.0-12.1.0
 2.4.0-12.4.1
+2.5.1-12.4.1
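The `.compatibility` file records one supported `<torch version>-<CUDA version>` pair per line; this PR drops torch 2.2.2 and adds torch 2.5.1 against CUDA 12.4.1. A minimal sketch of parsing this matrix (how CI consumes the file is an assumption, not shown in this PR):

```python
# Hypothetical helper: parse ".compatibility" entries of the form
# "<torch_version>-<cuda_version>", e.g. "2.5.1-12.4.1".
from pathlib import Path

def load_compatibility(path: str = ".compatibility") -> list[tuple[str, str]]:
    pairs = []
    for line in Path(path).read_text().splitlines():
        line = line.strip()
        if not line:
            continue
        torch_version, cuda_version = line.split("-", 1)
        pairs.append((torch_version, cuda_version))
    return pairs

# After this PR the matrix is:
# [("2.3.0", "12.1.0"), ("2.4.0", "12.4.1"), ("2.5.1", "12.4.1")]
```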
4 changes: 2 additions & 2 deletions .cuda_ext.json
@@ -1,11 +1,11 @@
 {
     "build": [
         {
-            "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121",
+            "torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
             "cuda_image": "hpcaitech/cuda-conda:12.1"
         },
         {
-            "torch_command": "pip install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124",
+            "torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
             "cuda_image": "hpcaitech/cuda-conda:12.4"
         }
     ]
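Each `build` entry pairs a pinned torch install command with the matching CUDA base image, so the prebuilt-extension jobs stay in sync with `.compatibility`. A hedged sketch of iterating that matrix (the loop and the shell invocation are illustrative assumptions, not this repo's CI code):

```python
# Illustrative sketch only: walk the build matrix from .cuda_ext.json.
import json
import subprocess

with open(".cuda_ext.json") as f:
    matrix = json.load(f)["build"]

for entry in matrix:
    print(f"building against image: {entry['cuda_image']}")
    # e.g. run the pinned install step for this matrix entry
    subprocess.run(entry["torch_command"], shell=True, check=True)
```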
2 changes: 1 addition & 1 deletion requirements/requirements.txt
@@ -8,7 +8,7 @@ click
 fabric
 contexttimer
 ninja
-torch>=2.2.0,<=2.4.1
+torch>=2.2.0,<=2.5.1
 safetensors
 einops
 pydantic
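This raises the supported torch ceiling from 2.4.1 to 2.5.1 while keeping the 2.2.0 floor. A quick way to check an installed interpreter against that range, using the `packaging` library (the helper name is an assumption):

```python
# Sketch: verify the installed torch satisfies the pinned range.
import torch
from packaging.specifiers import SpecifierSet

SUPPORTED = SpecifierSet(">=2.2.0,<=2.5.1")  # mirrors requirements.txt

def torch_is_supported() -> bool:
    # Strip local build suffixes such as "+cu124" before comparing.
    version = torch.__version__.split("+")[0]
    return version in SUPPORTED

print(torch_is_supported())
```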
3 changes: 2 additions & 1 deletion tests/test_cluster/test_device_mesh_manager.py
@@ -1,7 +1,7 @@
 from colossalai.cluster.device_mesh_manager import DeviceMeshInfo, DeviceMeshManager
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
-from colossalai.testing import spawn
+from colossalai.testing import rerun_if_address_is_in_use, spawn


 def check_device_mesh_manager(rank, world_size, port):
@@ -24,6 +24,7 @@ def check_device_mesh_manager(rank, world_size, port):
     assert device_mesh_with_shape._logical_mesh_id.tolist() == [[0, 1], [2, 3]]


+@rerun_if_address_is_in_use()
 def test_device_mesh_manager():
     spawn(check_device_mesh_manager, 4)

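`rerun_if_address_is_in_use()` is ColossalAI's testing decorator for retrying a distributed test whose rendezvous port is still held by a previous run. A generic sketch of the retry pattern it embodies (an illustrative stand-in, not the `colossalai.testing` implementation):

```python
# Illustrative retry decorator, NOT the colossalai.testing implementation:
# rerun a function when it fails with an "address already in use" error.
import functools
import time

def rerun_if_address_in_use_sketch(max_tries: int = 3, delay: float = 1.0):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            for _ in range(max_tries - 1):
                try:
                    return fn(*args, **kwargs)
                except RuntimeError as e:
                    if "address already in use" not in str(e).lower():
                        raise
                    time.sleep(delay)  # give the OS time to release the port
            return fn(*args, **kwargs)  # final attempt, errors propagate
        return wrapper
    return decorator
```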
2 changes: 1 addition & 1 deletion tests/test_shardformer/test_model/test_shard_t5.py
@@ -51,7 +51,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if test_config["precision"] == "fp32":
         atol, rtol = 1e-5, 1e-3
     else:
-        atol, rtol = 5e-2, 5e-2
+        atol, rtol = 9e-2, 0
     if (stage_manager is None or stage_manager.is_first_stage()) and booster.plugin.zero_stage == 0:
         row_layer_grads = get_grad_tensors_for_check(
             t5, sharded_t5, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0
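The tolerance change matters because `torch.allclose`-style checks pass when |actual - expected| <= atol + rtol * |expected|; with rtol = 0 the comparison becomes a flat absolute bound of 9e-2, which behaves more predictably for low-precision gradients whose expected values can sit near zero. A small sketch of the difference (tensor values are made up for illustration):

```python
# Sketch of the tolerance semantics behind the new low-precision check:
# pass iff |actual - expected| <= atol + rtol * |expected|
import torch

expected = torch.tensor([1e-3, 0.5, -2.0])
actual = expected + 0.07  # 7e-2 absolute error everywhere

# Old check: atol=5e-2, rtol=5e-2 -- fails for entries near zero,
# where the relative term contributes almost nothing.
print(torch.allclose(actual, expected, atol=5e-2, rtol=5e-2))  # False

# New check: atol=9e-2, rtol=0 -- a flat absolute bound of 9e-2.
print(torch.allclose(actual, expected, atol=9e-2, rtol=0))  # True
```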