Merged
Changes from all commits (36 commits):
- 6b10bec: switch to torch distributed checkpointing (ashors1, Mar 26, 2025)
- 83ca3a0: save model weights and optimizer states to separate directories (ashors1, Mar 31, 2025)
- 4fbb313: only load optimizer if path provided (ashors1, Mar 31, 2025)
- 05bd71e: option to save model weights in hf format (ashors1, Apr 1, 2025)
- 94d9227: address comments (ashors1, Apr 1, 2025)
- 16f070f: refactor (ashors1, Apr 1, 2025)
- 7ad920f: remove unused imports (ashors1, Apr 1, 2025)
- bb35d8e: update imports (ashors1, Apr 1, 2025)
- 10a25a4: support saving both HF and torch DCP checkpoints (ashors1, Apr 1, 2025)
- f712f66: cleanup, add tokenizer_name to config (ashors1, Apr 1, 2025)
- e2c264c: add example conversion script (ashors1, Apr 1, 2025)
- 65a2715: Merge branch 'main' of github.com:NVIDIA/reinforcer into ashors/dist-… (ashors1, Apr 1, 2025)
- 2affa7c: copyright (ashors1, Apr 1, 2025)
- b05df3d: address comments (ashors1, Apr 1, 2025)
- bb902c0: set is_main_process during hf save (ashors1, Apr 2, 2025)
- aac057f: save to absolute path (ashors1, Apr 2, 2025)
- 78c2a91: add documentation and save hf checkpoint at the end of training (ashors1, Apr 2, 2025)
- 874d446: formatting (ashors1, Apr 2, 2025)
- a422db4: capitalization (ashors1, Apr 2, 2025)
- 848b654: add hf checkpointing unit tests (ashors1, Apr 2, 2025)
- 8ca4dbd: add copyright (ashors1, Apr 2, 2025)
- 163c6c9: linting (ashors1, Apr 2, 2025)
- 83785ec: fix tests (ashors1, Apr 2, 2025)
- 7eee568: add HF checkpoint save test (ashors1, Apr 3, 2025)
- 1cb9483: fix tests and improve docstrings (ashors1, Apr 3, 2025)
- 3affdfd: address comments (ashors1, Apr 3, 2025)
- 174ef81: cleanup (ashors1, Apr 3, 2025)
- f6e56d0: make HF save multi-process safe (ashors1, Apr 3, 2025)
- 58a0d3d: hf checkpointing fix (ashors1, Apr 4, 2025)
- fe3f47c: enable checkpointing in sft functional test (ashors1, Apr 4, 2025)
- eb6eca7: Merge branch 'main' of github.com:NVIDIA/reinforcer into ashors/dist-… (ashors1, Apr 5, 2025)
- 4f17431: update unit test (ashors1, Apr 7, 2025)
- 4e11a2e: address comments (ashors1, Apr 7, 2025)
- 57c5811: Merge branch 'main' of github.com:NVIDIA/reinforcer into ashors/dist-… (ashors1, Apr 7, 2025)
- 7412223: small fixes (ashors1, Apr 7, 2025)
- 0749217: fix trap command (ashors1, Apr 7, 2025)
22 changes: 22 additions & 0 deletions docs/design_docs/checkpointing.md
@@ -0,0 +1,22 @@
# Checkpointing with HuggingFace Models

## Checkpoint Format
Reinforcer provides two checkpoint formats for HuggingFace models: Torch distributed (DCP) and HuggingFace format. Torch distributed is the default because it is more efficient; HuggingFace format is provided for compatibility with HuggingFace's `AutoModel.from_pretrained` API. Note that HuggingFace-format checkpoints save only the model weights and omit the optimizer states, so it is recommended to save intermediate checkpoints in Torch distributed format and to save a HuggingFace checkpoint only at the end of training.
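
For example, a checkpoint exported in HuggingFace format can be loaded back with the standard `transformers` API. The snippet below is a minimal sketch: the checkpoint path is a placeholder, and the tokenizer is loaded from the original model name in case tokenizer files are not part of the export.

```python
# Minimal sketch: load a Reinforcer HF-format export with transformers.
# "path/to/hf-ckpt" is a placeholder for the exported checkpoint directory.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("path/to/hf-ckpt")
# If tokenizer files are not included in the export, load the tokenizer from
# the original model (or from `tokenizer_name` in your training config).
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
```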

There are two ways to get a Reinforcer checkpoint in HuggingFace format.

1. (Recommended) Save the HuggingFace checkpoint directly by passing `save_hf=True` to `HfPolicy`'s `save_checkpoint`:

```python
policy.save_checkpoint(
    weights_path=<WHERE_TO_SAVE_MODEL_WEIGHTS>,
    optimizer_path=<WHERE_TO_SAVE_OPTIM_STATE>,
    save_torch_dist=True,
    save_hf=True,
)
```
2. Convert a Torch distributed checkpoint to HuggingFace format after training. We provide a conversion script for this purpose:

```bash
uv run examples/convert_dcp_to_hf.py \
    --config=<YAML CONFIG USED DURING TRAINING> \
    <ANY CONFIG OVERRIDES USED DURING TRAINING> \
    --dcp-ckpt-path=<PATH TO DIST CHECKPOINT TO CONVERT> \
    --hf-ckpt-path=<WHERE TO SAVE HF CHECKPOINT>
```
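
For reference, resuming training from a Torch distributed checkpoint uses the same `HfPolicy` constructor pattern that appears in the `grpo.py` and `sft.py` changes below. This is a sketch, assuming `cluster`, `policy_config`, and `checkpoint_path` are set up as in those scripts:

```python
# Sketch: re-create the policy from a saved Torch distributed checkpoint.
# `cluster`, `policy_config`, and `checkpoint_path` are assumed to exist.
from pathlib import Path

policy = HfPolicy(
    cluster=cluster,
    config=policy_config,
    weights_path=Path(checkpoint_path) / "policy" / "weights",
    optimizer_path=Path(checkpoint_path) / "policy" / "optimizer",
    init_optimizer=True,
)
```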
1 change: 1 addition & 0 deletions docs/index.md
@@ -47,4 +47,5 @@
design_docs/logger.md
design_docs/uv.md
design_docs/chat_datasets.md
design_docs/generation.md
design_docs/checkpointing.md
```
1 change: 1 addition & 0 deletions examples/configs/grpo_math_1B.yaml
@@ -25,6 +25,7 @@ checkpointing:

policy:
model_name: "meta-llama/Llama-3.2-1B-Instruct"
tokenizer_name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
train_global_batch_size: 512
train_micro_batch_size: 4
generation_batch_size: 32 # Only used when generating using HF backend
1 change: 1 addition & 0 deletions examples/configs/grpo_math_8B.yaml
@@ -7,6 +7,7 @@ grpo:

policy:
model_name: "meta-llama/Llama-3.1-8B-Instruct"
tokenizer_name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
train_global_batch_size: 512
train_micro_batch_size: 1
generation_batch_size: 32 # Only used when generating using HF backend
1 change: 1 addition & 0 deletions examples/configs/sft.yaml
@@ -18,6 +18,7 @@ checkpointing:

policy:
model_name: "meta-llama/Llama-3.2-1B"
tokenizer_name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
train_global_batch_size: 32
train_micro_batch_size: 1
max_total_sequence_length: 1024
92 changes: 92 additions & 0 deletions examples/convert_dcp_to_hf.py
@@ -0,0 +1,92 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json
import os

from nemo_reinforcer.distributed.virtual_cluster import init_ray, RayVirtualCluster
from nemo_reinforcer.models.policy.hf_policy import HfPolicy


def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description="Convert Torch DCP checkpoint to HF checkpoint"
    )
    parser.add_argument(
        "--config",
        type=str,
        default=None,
        help="Path to config.json file in the checkpoint directory",
    )
    parser.add_argument(
        "--dcp-ckpt-path", type=str, default=None, help="Path to DCP checkpoint"
    )
    parser.add_argument(
        "--hf-ckpt-path", type=str, default=None, help="Path to save HF checkpoint"
    )
    return parser.parse_args()


def main():
    """Main entry point."""
    args = parse_args()

    with open(args.config, "r") as f:
        config = json.load(f)

    dcp_ckpt = args.dcp_ckpt_path
    hf_ckpt = args.hf_ckpt_path

    # Extract individual configs for easier access
    policy_config = config["policy"]
    cluster_config = config["cluster"]

    init_ray()

    cluster = RayVirtualCluster(
        name="convert_cluster",
        bundle_ct_per_node_list=[cluster_config["gpus_per_node"]]
        * cluster_config["num_nodes"],
        use_gpus=True,
        num_gpus_per_node=cluster_config["gpus_per_node"],
        max_colocated_worker_groups=1,
    )

    # Load the model weights from the Torch DCP checkpoint. The optimizer
    # state is not needed for conversion, so skip initializing it.
    policy = HfPolicy(
        cluster=cluster,
        config=policy_config,
        weights_path=dcp_ckpt,
        init_optimizer=False,
    )

    # Save the weights in HuggingFace format only (no Torch DCP copy).
    policy.save_checkpoint(
        weights_path=os.path.abspath(hf_ckpt),
        save_hf=True,
        save_torch_dist=False,
    )

    print(f"Saved HF checkpoint to: {hf_ckpt}-hf")

    cluster.shutdown()
    policy.worker_group.shutdown()


if __name__ == "__main__":
    main()
18 changes: 14 additions & 4 deletions nemo_reinforcer/algorithms/grpo.py
@@ -236,10 +236,10 @@ def setup(
policy = HfPolicy(
cluster=cluster,
config=policy_config,
weights_path=Path(last_checkpoint_path) / "policy.pt"
weights_path=Path(last_checkpoint_path) / "policy" / "weights"
if last_checkpoint_path
else None,
optimizer_path=Path(last_checkpoint_path) / "policy_optimizer.pt"
optimizer_path=Path(last_checkpoint_path) / "policy" / "optimizer"
if last_checkpoint_path
else None,
init_optimizer=True,
@@ -608,6 +608,13 @@ def grpo_train(
and (step + 1) % master_config["checkpointing"]["save_period"] == 0
): # +1 because step is 0-indexed
policy.prepare_for_training()

+ is_last_checkpoint = (
+     min(len(dataloader), master_config["grpo"]["max_num_steps"])
+     - (step + 1)
+     < master_config["checkpointing"]["save_period"]
+ )

grpo_save_state["step"] = step + 1
grpo_save_state["val_reward"] = val_metrics["accuracy"]
grpo_save_state["consumed_samples"] = consumed_samples
@@ -617,8 +624,11 @@
step + 1, grpo_save_state, master_config
)
policy.save_checkpoint(
os.path.join(checkpoint_path, "policy.pt"),
os.path.join(checkpoint_path, "policy_optimizer.pt"),
weights_path=os.path.join(checkpoint_path, "policy", "weights"),
optimizer_path=os.path.join(
checkpoint_path, "policy", "optimizer"
),
save_hf=is_last_checkpoint,
)
torch.save(
dataloader.state_dict(),
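
The `is_last_checkpoint` arithmetic above marks the final periodic save so the HuggingFace export happens exactly once per run. A small standalone check with hypothetical values (`save_period=2`, `max_num_steps=6`, a dataloader of length 10):

```python
# Hypothetical values to illustrate when is_last_checkpoint becomes True.
save_period, max_num_steps, dataloader_len = 2, 6, 10

for step in range(max_num_steps):
    if (step + 1) % save_period == 0:  # a checkpoint step
        is_last = min(dataloader_len, max_num_steps) - (step + 1) < save_period
        print(f"step {step + 1}: save_hf={is_last}")
# Prints: step 2: save_hf=False, step 4: save_hf=False, step 6: save_hf=True
```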
27 changes: 16 additions & 11 deletions nemo_reinforcer/algorithms/sft.py
@@ -175,10 +175,10 @@ def setup(
policy = HfPolicy(
cluster=cluster,
config=policy_config,
weights_path=Path(last_checkpoint_path) / "policy.pt"
weights_path=Path(last_checkpoint_path) / "policy" / "weights"
if last_checkpoint_path
else None,
- optimizer_path=Path(last_checkpoint_path) / "policy_optimizer.pt"
+ optimizer_path=Path(last_checkpoint_path) / "policy" / "optimizer"
if last_checkpoint_path
else None,
init_optimizer=True,
@@ -311,9 +311,7 @@ def sft_train(
sft_save_state = _default_sft_save_state()
step = 0
else:
- step = (
-     sft_save_state["step"] + 1
- )  # N+1 because the checkpoint is _after_ SFT iteration N
+ step = sft_save_state["step"]

sft_config = master_config["sft"]
# Validation configuration
@@ -399,19 +397,26 @@
master_config["checkpointing"]["enabled"]
and (step + 1) % master_config["checkpointing"]["save_period"] == 0
): # +1 because step is 0-indexed
- sft_save_state["step"] = step
+ is_last_checkpoint = (
+     min(len(train_dataloader), master_config["sft"]["max_num_steps"])
+     - (step + 1)
+     < master_config["checkpointing"]["save_period"]
+ )
+
+ sft_save_state["step"] = step + 1
sft_save_state["val_loss"] = val_metrics["val_loss"]
with timer.time("checkpointing"):
print(f"Saving checkpoint for step {step + 1}...")
checkpoint_path = checkpointer.init_tmp_checkpoint(
step + 1, sft_save_state, master_config
)

policy.save_checkpoint(
os.path.join(checkpoint_path, "policy.pt"),
os.path.join(checkpoint_path, "policy_optimizer.pt"),
## NOTE: below is a workaround to avoid a bug with checkpointing
## this should be removed once the bug is fixed
offload_to_cpu=False,
weights_path=os.path.join(checkpoint_path, "policy", "weights"),
optimizer_path=os.path.join(
checkpoint_path, "policy", "optimizer"
),
save_hf=is_last_checkpoint,
)
torch.save(
train_dataloader.state_dict(),
1 change: 1 addition & 0 deletions nemo_reinforcer/models/policy/__init__.py
@@ -19,6 +19,7 @@

class PolicyConfig(TypedDict):
model_name: str
tokenizer_name: str
train_global_batch_size: int
train_micro_batch_size: int
learning_rate: float
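
For illustration, a config dict matching `PolicyConfig` with the new `tokenizer_name` field might look like the sketch below; the values are hypothetical, and only the fields visible in this diff are shown:

```python
# Hypothetical example values. In the YAML configs above, tokenizer_name
# defaults to the model's own tokenizer via ${policy.model_name}.
policy_config = {
    "model_name": "meta-llama/Llama-3.2-1B-Instruct",
    "tokenizer_name": "meta-llama/Llama-3.2-1B-Instruct",
    "train_global_batch_size": 512,
    "train_micro_batch_size": 4,
    "learning_rate": 5e-6,
}
```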