diff --git a/src/art/rewards/ruler.py b/src/art/rewards/ruler.py
index 84d41f4de..ecb4b714d 100644
--- a/src/art/rewards/ruler.py
+++ b/src/art/rewards/ruler.py
@@ -55,6 +55,7 @@ async def ruler(
judge_model: str = "openai/o3",
extra_litellm_params: dict | None = None,
rubric: str = DEFAULT_RUBRIC,
+ tools: list | None = None,
*,
debug: bool = False,
) -> list[TrajectoryScore]:
@@ -81,6 +82,8 @@ async def ruler(
extra_litellm_params: Additional parameters to pass to LiteLLM completion.
Can include temperature, max_tokens, etc.
rubric: The grading rubric. The default rubric works well for most tasks.
+ tools: Optional list of tool definitions available to the agent. When provided,
+        the judge is shown which tools were available so it can evaluate the agent's tool usage.
    debug: If True, pretty-print the judge's reasoning to help understand scores.

    Returns:
@@ -137,6 +140,12 @@ async def ruler(
"\n" + json.dumps(common_prefix_messages) + "\n\n\n"
    )

+    # Include available tools so the judge knows which tool calls are valid
+ if tools:
+ user_text += (
+ "\n" + json.dumps(tools) + "\n\n\n"
+ )
+
# Serialize each trajectory (minus the common prefix) for the judge.
# If all trajectories are identical, only serialize one full trajectory to save tokens.
serialized_trajectories: List[str] = []
@@ -292,6 +301,9 @@ async def ruler_score_group(
message_lists.append(traj.messages())
        traj.metrics["independent_reward"] = traj.reward

+    # Extract tools from the first trajectory (the whole group should share the same tools)
+ tools = new_trajectories[0].tools if new_trajectories else None
+
try:
# Call the core ruler function to get scores
scores = await ruler(
@@ -299,6 +311,7 @@ async def ruler_score_group(
judge_model=judge_model,
extra_litellm_params=extra_litellm_params,
rubric=rubric,
+ tools=tools,
debug=debug,
)
except Exception as e:
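Illustrative usage (a minimal sketch, not part of the patch): the snippet below shows how a caller might attach tool definitions to trajectories so the judge sees them when scoring. It assumes art.Trajectory accepts messages_and_choices, tools, and reward, that art.TrajectoryGroup wraps a list of trajectories, and that ruler_score_group is awaited as in the diff above; the import path for ruler_score_group is taken from the file location and may differ, and the get_weather tool and example messages are invented for illustration.

import asyncio

import art
from art.rewards import ruler_score_group  # assumed re-export from art/rewards/ruler.py

# OpenAI-style function-tool definition the agent had available during rollout.
weather_tool = {
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Look up the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}

group = art.TrajectoryGroup(
    trajectories=[
        art.Trajectory(
            messages_and_choices=[
                {"role": "system", "content": "You are a weather assistant."},
                {"role": "user", "content": "What's the weather in Paris?"},
                {"role": "assistant", "content": "It is 18 °C and sunny in Paris."},
            ],
            tools=[weather_tool],  # same tool list on every trajectory in the group
            reward=0.0,
        )
        for _ in range(4)
    ]
)

# With this patch, ruler_score_group forwards the trajectories' tools to the judge,
# so the judge knows which tool calls were valid options for the agent.
scored_group = asyncio.run(
    ruler_score_group(group, judge_model="openai/o3", debug=True)
)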