diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index b7d464252dc1..716f73d3b469 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1471,6 +1471,11 @@ def save_checkpoint(self, save_dir, tag=None, client_state={}, save_latest=True) used if not provided. Tag name must be the same across all ranks. client_state: Optional. State dictionary used for saving required training states in the client code. save_latest: Optional. Save a file 'latest' pointing to the latest saved checkpoint. + + Important: all processes must call this method and not just the process with rank 0. This is + because each process needs to save its master weights and scheduler+optimizer states. This + method will hang waiting to synchronize with other processes if it's called only on the + process with rank 0. """ # This is to make sure the checkpoint names are created without collision diff --git a/docs/_tutorials/getting-started.md b/docs/_tutorials/getting-started.md index 5c53f2084f3c..37f104f0739e 100644 --- a/docs/_tutorials/getting-started.md +++ b/docs/_tutorials/getting-started.md @@ -127,6 +127,9 @@ accepts a client state dictionary `client_sd` for saving. These items can be retrieved from `load_checkpoint` as a return argument. In the example above, the `step` value is stored as part of the `client_sd`. +Important: all processes must call this method and not just the process with rank 0. This is because +each process needs to save its master weights and scheduler+optimizer states. This method will hang +waiting to synchronize with other processes if it's called only on the process with rank 0. ## DeepSpeed Configuration DeepSpeed features can be enabled, disabled, or configured using a config JSON