From c630ca395a2f4aac3d0f560f978302fe079d993b Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Thu, 13 Nov 2025 23:54:26 +0800 Subject: [PATCH 1/4] fix: Handle case when all trajectories are identical --- src/art/rewards/ruler.py | 47 +++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index 3724105d3..98fe2f73f 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -117,23 +117,44 @@ async def ruler( else: break + # Detect if all trajectories are identical + all_identical = all( + len(msg_list) == common_prefix_len for msg_list in message_lists + ) + + if all_identical and len(message_lists) > 1: + print( + f"[art_ruler] Warning: All {len(message_lists)} trajectories are identical. " + "Using absolute scoring (loses relative grounding benefit)." + ) + # If there is a non-empty common prefix, serialize it once to save tokens. + # Skip this optimization if all trajectories are identical (we'll send the full trajectory instead). user_text = "" - if common_prefix_len > 0: + if common_prefix_len > 0 and not all_identical: common_prefix_messages = message_lists[0][:common_prefix_len] user_text += ( "\n" + json.dumps(common_prefix_messages) + "\n\n\n" ) # Serialize each trajectory (minus the common prefix) for the judge. + # If all trajectories are identical, only serialize one full trajectory to save tokens. serialized_trajectories: List[str] = [] - for idx, full_messages in enumerate(message_lists, start=1): - trimmed_messages = full_messages[common_prefix_len:] + if all_identical: + # Send the full trajectory since they're all identical + full_trajectory = message_lists[0] serialized_trajectories.append( - f'\n' - + json.dumps(trimmed_messages) - + "\n" + f'\n' + json.dumps(full_trajectory) + "\n" ) + else: + # Serialize each unique trajectory + for idx, full_messages in enumerate(message_lists, start=1): + trimmed_messages = full_messages[common_prefix_len:] + serialized_trajectories.append( + f'\n' + + json.dumps(trimmed_messages) + + "\n" + ) user_text += "Trajectories:\n\n" + "\n\n".join(serialized_trajectories) @@ -175,9 +196,19 @@ async def ruler( content = first_choice.message.content or "{}" # type: ignore[attr-defined] parsed = Response.model_validate_json(content) - assert len(parsed.scores) == len(message_lists) - return parsed.scores + # If all trajectories were identical, we only sent one to the judge + # Duplicate the score for all trajectories + if all_identical: + assert len(parsed.scores) == 1 + single_score = parsed.scores[0] + return [ + single_score.model_copy(update={"trajectory_id": str(i)}) + for i in range(1, len(message_lists) + 1) + ] + else: + assert len(parsed.scores) == len(message_lists) + return parsed.scores async def ruler_score_group( From 12cb813bc6c135189c40efc25bcbfe55c2aa462f Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Fri, 14 Nov 2025 09:53:23 +0800 Subject: [PATCH 2/4] chore: Change logger label --- src/art/rewards/ruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index 98fe2f73f..b08a0080b 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -124,7 +124,7 @@ async def ruler( if all_identical and len(message_lists) > 1: print( - f"[art_ruler] Warning: All {len(message_lists)} trajectories are identical. " + f"[RULER] Warning: All {len(message_lists)} trajectories are identical. " "Using absolute scoring (loses relative grounding benefit)." ) From e24464ad0d6adc097a059426a1fbc23e8f47d516 Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Fri, 14 Nov 2025 09:54:47 +0800 Subject: [PATCH 3/4] fix: Raise error instead of assert --- src/art/rewards/ruler.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index b08a0080b..66af6b653 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -200,14 +200,20 @@ async def ruler( # If all trajectories were identical, we only sent one to the judge # Duplicate the score for all trajectories if all_identical: - assert len(parsed.scores) == 1 + if len(parsed.scores) != 1: + raise ValueError( + f"Expected 1 score for identical trajectories, but got {len(parsed.scores)}" + ) single_score = parsed.scores[0] return [ single_score.model_copy(update={"trajectory_id": str(i)}) for i in range(1, len(message_lists) + 1) ] else: - assert len(parsed.scores) == len(message_lists) + if len(parsed.scores) != len(message_lists): + raise ValueError( + f"Expected {len(message_lists)} scores, but got {len(parsed.scores)}" + ) return parsed.scores From f1501be5bc437da35018736ab5b531259a35ef2f Mon Sep 17 00:00:00 2001 From: Vivek Kalyan Date: Fri, 14 Nov 2025 09:55:48 +0800 Subject: [PATCH 4/4] style: Remove trailing spaces --- src/art/rewards/ruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py index 66af6b653..2ea333124 100644 --- a/src/art/rewards/ruler.py +++ b/src/art/rewards/ruler.py @@ -39,7 +39,7 @@ class Response(BaseModel): DEFAULT_RUBRIC = dedent( - """ + """ - A trajectory that achieves its goal should always get a significantly higher score than a trajectory that does not achieve its goal. - A trajectory that achieves its goal more efficiently (eg. by avoiding unproductive detours) should get a higher score than a trajectory that achieves its goal less efficiently. - If one trajectory is only slightly better than another, the difference in scores should be small. If it is significantly better, the difference in scores should be large.