diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 6be85d9e0d..d93385ec5f 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -578,7 +578,12 @@ def train( "generation" in self.cfg and self.cfg["generation"] is not None ): - logits.div_(self.cfg["generation"]["temperature"]) + # The V1 engine returns raw logits before temperature scaling. + # The V0 engine (when VLLM_USE_V1 is not '1') returns scaled logits. + # Therefore, we only divide if we are NOT using the V1 engine. + use_v1_engine = os.environ.get("VLLM_USE_V1") == "1" + if not use_v1_engine: + logits.div_(self.cfg["generation"]["temperature"]) if self.cp_size > 1: seq_index_dtensor = (