From 7028f4184646de951db548b608d4d65db6e1a62b Mon Sep 17 00:00:00 2001 From: Vitaly Kurin Date: Thu, 11 Dec 2025 04:06:51 -0800 Subject: [PATCH] Track split_megabytes in wandb. --- megatron/rl/rl_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py index d6079b86448..a53c2dbaca6 100644 --- a/megatron/rl/rl_utils.py +++ b/megatron/rl/rl_utils.py @@ -975,9 +975,13 @@ def maybe_log_training_metrics( advantages = wandb_writer.Table( columns=['advantages'], data=[[x] for x in group_stats.advantages] ) + stats = torch.cuda.memory_stats() + # 1024*1024 = 1048576 + n_split_megabytes = stats.get("inactive_split_bytes.all.current", 0)/1048576 wandb_writer.log( { **{ + 'split_megabytes': n_split_megabytes, 'group_means_hist': wandb_writer.plot.histogram( group_table, 'group_means', 'Group Means' ),