Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/commit0/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def process_commit0_results(input_file: str, output_file: str) -> None:

# Generate report
report = {
"benchmark": "commit0",
"total_instances": 16, # Fixed as per requirement
"submitted_instances": len(completed_ids),
"completed_instances": len(completed_ids),
Expand Down
63 changes: 41 additions & 22 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import LLM, Agent, Conversation, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -185,28 +186,46 @@ def prepare_workspace(
logger.info(f"Using base docker image: {base_docker_image}")

if self.metadata.workspace_type == "docker":
# Build agent-server image from base commit0 image
workspace = DockerDevWorkspace(
base_image=base_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
logger.info(
f"Building workspace from {base_docker_image}. This may take a while..."
)
# Try to build agent-server image from base commit0 image
# Fall back to pre-built image if build fails
try:
workspace = DockerDevWorkspace(
base_image=base_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
logger.info(
f"Building workspace from {base_docker_image}. This may take a while..."
)
except Exception:
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
if not image_exists(agent_server_image):
raise RuntimeError(
f"On-the-fly build failed and pre-built image {agent_server_image} does not exist"
)

workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
logger.info(f"Using pre-built image {agent_server_image}")
Comment on lines +191 to +217
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: do we need this fallback? In which cases would the build fail? Wouldn't a retry mechanism be enough?

elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)

if not image_exists(agent_server_image):
Expand All @@ -217,7 +236,7 @@ def prepare_workspace(

logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down Expand Up @@ -578,7 +597,10 @@ def evaluate_instance(

def main() -> None:
prompt_dir = (Path(__file__).parent / "prompts").resolve()
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
try:
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
except ValueError:
choices = [str(p) for p in prompt_dir.glob("*.j2")]
Comment on lines +600 to +603
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question here about the defensive try/except.
Why would this code fail? Should the logic itself be fixed, rather than adding a defensive fallback code path?

default_prompt_path = prompt_dir / "default.j2"
assert default_prompt_path.exists(), (
f"Default prompt {default_prompt_path} not found"
Expand All @@ -605,12 +627,7 @@ def main() -> None:
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

dataset_description = (
Expand All @@ -630,6 +647,7 @@ def main() -> None:
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
conversation_timeout=args.conversation_timeout,
eval_output_dir=structured_output_dir,
details={},
prompt_path=args.prompt_path,
Expand All @@ -639,6 +657,7 @@ def main() -> None:
critic=create_critic(args),
selected_instances_file=args.select,
max_retries=args.max_retries,
skip_failed_samples=args.skip_failed_samples,
workspace_type=args.workspace,
)

Expand Down
1 change: 1 addition & 0 deletions benchmarks/gaia/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"split": "validation",
"level": "2023_all",
"num_workers": 30,
"critic": "pass",
}

# Build defaults (used by build_images.py)
Expand Down
1 change: 1 addition & 0 deletions benchmarks/gaia/eval_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ def process_gaia_results(

# Generate report
report = {
"benchmark": "gaia",
"total_instances": len(submitted_ids),
"submitted_instances": len(submitted_ids),
"completed_instances": len(completed_ids),
Expand Down
53 changes: 35 additions & 18 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.version import SDK_SHORT_SHA
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from openhands.sdk import (
LLM,
Agent,
Expand All @@ -42,7 +43,7 @@
)
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -151,12 +152,29 @@ def prepare_workspace(
logger.info(f"Preparing workspace for instance {instance.id}")

if self.metadata.workspace_type == "docker":
# Use DockerDevWorkspace with base image (same as main branch)
workspace = DockerDevWorkspace(
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
working_dir="/workspace",
forward_env=forward_env or [],
)
# Use DockerDevWorkspace with base image
# Fall back to pre-built image if build fails
try:
workspace = DockerDevWorkspace(
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
working_dir="/workspace",
forward_env=forward_env or [],
)
except Exception as build_error:
build_target = os.getenv("GAIA_BUILD_TARGET", "binary-minimal")
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-{build_target}"
)
if not image_exists(agent_server_image):
raise RuntimeError(
f"On-the-fly build failed and pre-built image {agent_server_image} does not exist"
)
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
logger.info(f"Using pre-built image {agent_server_image}")
elif self.metadata.workspace_type == "remote":
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
# GAIA uses a universal agent server image (one image for all instances)
Expand All @@ -169,9 +187,8 @@ def prepare_workspace(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
)

if not image_exists(agent_server_image):
Expand All @@ -182,7 +199,7 @@ def prepare_workspace(

logger.info(
f"Using remote workspace with GAIA image {agent_server_image} "
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down Expand Up @@ -349,7 +366,9 @@ def evaluate_instance(
else:
conversation.send_message(instruction)
# Run conversation with fake user responses to handle agent messages
run_conversation_with_fake_user_response(conversation)
run_conversation_with_fake_user_response(
conversation, run_timeout=self.metadata.conversation_timeout
)

# Extract answer from conversation history
model_answer_raw = self._extract_answer_from_history(conversation.state.events)
Expand Down Expand Up @@ -565,12 +584,7 @@ def main() -> None:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")

# Load LLM config
llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
raise ValueError(f"LLM config file {llm_config_path} does not exist")
with open(llm_config_path, "r") as f:
llm_config = f.read()
llm = LLM.model_validate_json(llm_config)
llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))

# Construct dataset description
Expand All @@ -591,12 +605,15 @@ def main() -> None:
dataset=args.dataset,
dataset_split=args.split,
max_iterations=args.max_iterations,
conversation_timeout=args.conversation_timeout,
eval_output_dir=structured_output_dir,
details={"level": args.level},
eval_limit=args.n_limit,
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
skip_failed_samples=args.skip_failed_samples,
workspace_type=args.workspace,
)

Expand Down
20 changes: 13 additions & 7 deletions benchmarks/multiswebench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,16 @@
--image ghcr.io/openhands/eval-agent-server --target source-minimal
"""

import json
import os
from pathlib import Path

from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.utils.build_utils import (
build_all_images,
default_build_output_dir,
get_build_parser,
)
from benchmarks.utils.dataset import get_dataset
from openhands.sdk import get_logger


Expand All @@ -37,7 +38,7 @@ def get_official_docker_image(

# For Multi-SWE-Bench, the image naming depends on the language
repo = instance["repo"]
version = instance["version"]
version = instance.get("version", "")

if LANGUAGE == "python":
# Use SWE-bench style naming for Python
Expand All @@ -52,7 +53,7 @@ def get_official_docker_image(
else:
org = instance.get("org", repo)
repo_name = repo
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base"
official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower()

logger.debug(f"Multi-SWE-Bench image: {official_image_name}")
return official_image_name
Expand All @@ -79,12 +80,16 @@ def extract_custom_tag(base_image: str) -> str:

def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]:
"""Get all unique base images from the dataset."""
dataset = get_dataset(dataset_name, split)
local_path = download_and_concat_dataset(dataset_name, LANGUAGE)
base_images = set()

for _, row in dataset.iterrows():
image = get_official_docker_image(row.to_dict())
base_images.add(image)
with open(local_path, "r", encoding="utf-8") as f:
for line in f:
if not line.strip():
continue
instance = json.loads(line)
image = get_official_docker_image(instance)
base_images.add(image)

return list(base_images)

Expand All @@ -107,6 +112,7 @@ def main():
build_dir=Path(
args.output_dir or default_build_output_dir(args.dataset, args.split)
),
base_image_to_custom_tag_fn=extract_custom_tag,
max_workers=args.num_workers,
dry_run=False,
)
Expand Down
Loading