7 changes: 5 additions & 2 deletions colossalai/booster/plugin/gemini_plugin.py
@@ -99,8 +99,11 @@ def save_sharded_model(self,
             save_state_dict(shard, checkpoint_file_path, use_safetensors)
 
         index_file.append_meta_data("total_size", total_size)
-        index_file.write_index_file(save_index_file)
-        logging.info(f"The model is going to be split to checkpoint shards. "
+
+        # only save the index file on the master rank
+        if self.coordinator.is_master():
+            index_file.write_index_file(save_index_file)
+            logging.info(f"The model is split into checkpoint shards. "
                          f"You can find where each parameters has been saved in the "
                          f"index located at {save_index_file}.")

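For context on the gemini_plugin.py change: in a distributed run every rank executes the save path, so gating the index write on `self.coordinator.is_master()` keeps all ranks from writing the same file concurrently. Below is a minimal sketch of that pattern using `torch.distributed` directly; the helper name `write_index_on_master` is hypothetical and stands in for the coordinator check used in the diff.

```python
import json

import torch.distributed as dist


def write_index_on_master(index: dict, save_index_file: str) -> None:
    # In single-process runs there is no process group, so treat the
    # lone process as the master; otherwise only rank 0 writes.
    if not dist.is_initialized() or dist.get_rank() == 0:
        with open(save_index_file, "w", encoding="utf-8") as f:
            json.dump(index, f, indent=2)
            f.write("\n")
```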
18 changes: 10 additions & 8 deletions colossalai/checkpoint_io/index_file.py
@@ -1,8 +1,8 @@
-import json
-from pathlib import Path
-from typing import Any, List, Union
+import os
+import json
+from collections import OrderedDict
+from pathlib import Path
+from typing import Any, Dict, List, Union
 
 from .utils import is_dtensor_checkpoint

@@ -22,8 +22,10 @@ class CheckpointIndexFile:

     def __init__(self, root_path=None) -> None:
         self.root_path = root_path
-        self.metadata: dict = dict()
-        self.weight_map: dict = dict()
+
+        # use ordered dict to preserve the tensor checkpoint order
+        self.metadata: Dict = OrderedDict()
+        self.weight_map: Dict = OrderedDict()
 
     @staticmethod
     def from_file(index_path: Union[str, Path]):
@@ -150,19 +152,19 @@ def get_checkpoint_file(self, param_name: str) -> str:
"""
ckpt_path = self.weight_map[param_name]
return ckpt_path

def get_all_param_names(self):
"""
Get all the weight keys.
"""
return list(self.weight_map.keys())

def write_index_file(self, save_index_file):
"""
Write index file.
"""
save_index_file = os.path.join(self.root_path, save_index_file)
index = {"metadata": self.metadata, "weight_map": self.weight_map}
with open(save_index_file, "w", encoding="utf-8") as f:
content = json.dumps(index, indent=2, sort_keys=True) + "\n"
content = json.dumps(index, indent=2) + "\n"
f.write(content)
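The two index_file.py changes work together: `metadata` and `weight_map` become `OrderedDict` so entries are recorded in the order shards are appended, and `sort_keys=True` is dropped because it would re-sort the JSON keys alphabetically and discard exactly that order. A small illustration with hypothetical weight-map entries:

```python
import json
from collections import OrderedDict

# Hypothetical entries, inserted in shard order.
weight_map = OrderedDict([
    ("embed.weight", "shard-00001.bin"),
    ("layers.0.weight", "shard-00001.bin"),
    ("head.weight", "shard-00002.bin"),
])

# sort_keys=True alphabetizes the keys, losing the checkpoint order:
#   "embed.weight", "head.weight", "layers.0.weight"
print(json.dumps(weight_map, indent=2, sort_keys=True))

# Without sort_keys, json.dumps keeps insertion order, so the index
# lists parameters in the order they were written to shards.
print(json.dumps(weight_map, indent=2))
```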
5 changes: 4 additions & 1 deletion colossalai/zero/gemini/gemini_ddp.py
@@ -716,7 +716,10 @@ def append(self, name: str, tensor: torch.Tensor) -> Tuple[Optional[OrderedDict]
         tensor_size = calculate_tensor_size(tensor)
         ret_block = None
         ret_block_size = 0
-        if self.current_block_size + tensor_size > self.max_shard_size:
+
+        # before we return the current block and create a new block,
+        # we need to ensure that the current block is not empty
+        if self.current_block_size + tensor_size > self.max_shard_size and self.current_block_size > 0:
             ret_block = self.current_block
             ret_block_size = self.current_block_size
             self.current_block = OrderedDict()
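The added `self.current_block_size > 0` guard matters when a single tensor is larger than `max_shard_size`: without it, `append` would return an empty block before the oversized tensor is placed into a fresh one. A self-contained sketch of the append logic under that reading; the class name `StateDictSharder` and the `calculate_tensor_size` stand-in are assumptions, not the exact ColossalAI definitions.

```python
from collections import OrderedDict
from typing import Optional, Tuple

import torch


def calculate_tensor_size(tensor: torch.Tensor) -> int:
    # Stand-in for the helper used in gemini_ddp.py: size in bytes.
    return tensor.numel() * tensor.element_size()


class StateDictSharder:
    def __init__(self, max_shard_size: int) -> None:
        self.max_shard_size = max_shard_size
        self.current_block: OrderedDict = OrderedDict()
        self.current_block_size = 0

    def append(self, name: str, tensor: torch.Tensor) -> Tuple[Optional[OrderedDict], int]:
        tensor_size = calculate_tensor_size(tensor)
        ret_block, ret_block_size = None, 0
        # Flush only when the current block actually holds tensors,
        # so an oversized tensor never produces an empty shard.
        if self.current_block_size + tensor_size > self.max_shard_size and self.current_block_size > 0:
            ret_block, ret_block_size = self.current_block, self.current_block_size
            self.current_block = OrderedDict()
            self.current_block_size = 0
        self.current_block[name] = tensor
        self.current_block_size += tensor_size
        return ret_block, ret_block_size


# With the guard, a tensor bigger than the shard limit does not flush
# an empty block:
sharder = StateDictSharder(max_shard_size=1024)
big = torch.zeros(1024)  # 4096 bytes > 1024
flushed, _ = sharder.append("w", big)
assert flushed is None
```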