From 5249749d2baa8f1a9bd3ed25a2185dd5fefaccfe Mon Sep 17 00:00:00 2001 From: Ewa Dobrowolska Date: Thu, 12 Feb 2026 20:43:51 +0100 Subject: [PATCH 1/2] nemo-evaluator implementation --- VERSION | 1 + benchmarks/commit0/eval_infer.py | 1 + benchmarks/commit0/run_infer.py | 63 +++-- benchmarks/gaia/config.py | 1 + benchmarks/gaia/eval_infer.py | 1 + benchmarks/gaia/run_infer.py | 53 ++-- benchmarks/multiswebench/build_images.py | 20 +- benchmarks/multiswebench/eval_infer.py | 53 +++- benchmarks/multiswebench/run_infer.py | 66 +++-- benchmarks/openagentsafety/build_images.py | 16 +- benchmarks/openagentsafety/run_infer.py | 119 +++++++- benchmarks/scripts/generate_llm_config.py | 84 ++++++ benchmarks/scripts/run_benchmark.py | 209 ++++++++++++++ benchmarks/swebench/eval_infer.py | 48 +++- benchmarks/swebench/run_infer.py | 26 +- benchmarks/swebenchmultimodal/eval_infer.py | 160 +++++++---- benchmarks/swebenchmultimodal/run_infer.py | 28 +- benchmarks/swtbench/eval_infer.py | 57 ++-- benchmarks/swtbench/run_infer.py | 58 ++-- benchmarks/utils/args_parser.py | 13 + benchmarks/utils/build_utils.py | 11 +- benchmarks/utils/constants.py | 8 +- benchmarks/utils/evaluation.py | 45 ++- benchmarks/utils/fake_user_response.py | 10 +- benchmarks/utils/image_utils.py | 20 ++ benchmarks/utils/llm_config.py | 37 +++ benchmarks/utils/models.py | 14 + benchmarks/utils/version.py | 15 +- .../openhands_benchmarks/__init__.py | 3 + .../openhands_benchmarks/framework.yml | 266 ++++++++++++++++++ nemo_evaluator/openhands_benchmarks/output.py | 54 ++++ pyproject.toml | 12 +- 32 files changed, 1326 insertions(+), 246 deletions(-) create mode 100644 VERSION create mode 100644 benchmarks/scripts/generate_llm_config.py create mode 100644 benchmarks/scripts/run_benchmark.py create mode 100644 benchmarks/utils/llm_config.py create mode 100644 nemo_evaluator/openhands_benchmarks/__init__.py create mode 100644 nemo_evaluator/openhands_benchmarks/framework.yml create mode 100644 nemo_evaluator/openhands_benchmarks/output.py diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index 8b45f85c..7300fa34 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -120,6 +120,7 @@ def process_commit0_results(input_file: str, output_file: str) -> None: # Generate report report = { + "benchmark": "commit0", "total_instances": 16, # Fixed as per requirement "submitted_instances": len(completed_ids), "completed_instances": len(completed_ids), diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index df10feb2..a4d29c74 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -29,11 +29,12 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -185,16 +186,35 @@ def prepare_workspace( logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": - # Build 
agent-server image from base commit0 image - workspace = DockerDevWorkspace( - base_image=base_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], - ) - logger.info( - f"Building workspace from {base_docker_image}. This may take a while..." - ) + # Try to build agent-server image from base commit0 image + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image=base_docker_image, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + logger.info( + f"Building workspace from {base_docker_image}. This may take a while..." + ) + except Exception: + custom_tag = extract_custom_tag(base_docker_image) + suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: @@ -202,11 +222,10 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) custom_tag = extract_custom_tag(base_docker_image) suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): @@ -217,7 +236,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -578,7 +597,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -605,12 +627,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -630,6 +647,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -639,6 +657,7 @@ def main() -> 
None: critic=create_critic(args), selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index dadaa20a..08fb7027 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,6 +10,7 @@ "split": "validation", "level": "2023_all", "num_workers": 30, + "critic": "pass", } # Build defaults (used by build_images.py) diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 7aad859b..bc11ebc0 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -148,6 +148,7 @@ def process_gaia_results( # Generate report report = { + "benchmark": "gaia", "total_instances": len(submitted_ids), "submitted_instances": len(submitted_ids), "completed_instances": len(completed_ids), diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 8f52d171..5d3987ab 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -28,7 +28,8 @@ from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( LLM, Agent, @@ -42,7 +43,7 @@ ) from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -151,12 +152,29 @@ def prepare_workspace( logger.info(f"Preparing workspace for instance {instance.id}") if self.metadata.workspace_type == "docker": - # Use DockerDevWorkspace with base image (same as main branch) - workspace = DockerDevWorkspace( - base_image="nikolaik/python-nodejs:python3.12-nodejs22", - working_dir="/workspace", - forward_env=forward_env or [], - ) + # Use DockerDevWorkspace with base image + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + working_dir="/workspace", + forward_env=forward_env or [], + ) + except Exception as build_error: + build_target = os.getenv("GAIA_BUILD_TARGET", "binary-minimal") + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-{build_target}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image # GAIA uses a universal agent server image (one image for all instances) @@ -169,9 +187,8 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" ) if not 
image_exists(agent_server_image): @@ -182,7 +199,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with GAIA image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -349,7 +366,9 @@ def evaluate_instance( else: conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # Extract answer from conversation history model_answer_raw = self._extract_answer_from_history(conversation.state.events) @@ -565,12 +584,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct dataset description @@ -591,12 +605,15 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={"level": args.level}, eval_limit=args.n_limit, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, + max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py index 3ecdeeb6..86894e04 100644 --- a/benchmarks/multiswebench/build_images.py +++ b/benchmarks/multiswebench/build_images.py @@ -8,15 +8,16 @@ --image ghcr.io/openhands/eval-agent-server --target source-minimal """ +import json import os from pathlib import Path +from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, get_build_parser, ) -from benchmarks.utils.dataset import get_dataset from openhands.sdk import get_logger @@ -37,7 +38,7 @@ def get_official_docker_image( # For Multi-SWE-Bench, the image naming depends on the language repo = instance["repo"] - version = instance["version"] + version = instance.get("version", "") if LANGUAGE == "python": # Use SWE-bench style naming for Python @@ -52,7 +53,7 @@ def get_official_docker_image( else: org = instance.get("org", repo) repo_name = repo - official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base" + official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower() logger.debug(f"Multi-SWE-Bench image: {official_image_name}") return official_image_name @@ -79,12 +80,16 @@ def extract_custom_tag(base_image: str) -> str: def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]: """Get all unique base images from the dataset.""" - dataset = get_dataset(dataset_name, split) + local_path = download_and_concat_dataset(dataset_name, LANGUAGE) base_images = set() - for _, row in dataset.iterrows(): - image = get_official_docker_image(row.to_dict()) - 
base_images.add(image) + with open(local_path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + instance = json.loads(line) + image = get_official_docker_image(instance) + base_images.add(image) return list(base_images) @@ -107,6 +112,7 @@ def main(): build_dir=Path( args.output_dir or default_build_output_dir(args.dataset, args.split) ), + base_image_to_custom_tag_fn=extract_custom_tag, max_workers=args.num_workers, dry_run=False, ) diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py index 3bb88cf1..a2efeb86 100644 --- a/benchmarks/multiswebench/eval_infer.py +++ b/benchmarks/multiswebench/eval_infer.py @@ -10,8 +10,10 @@ """ import argparse +import json import shutil import subprocess +import sys from pathlib import Path from benchmarks.multiswebench.download_dataset import download_and_concat_dataset @@ -59,8 +61,8 @@ def run_multi_swebench_evaluation( # Create config file for Multi-SWE-Bench config_file = work_dir / "config.json" - # Handle dataset path - download if it's a ByteDance-Seed/Multi-SWE-bench dataset - if dataset_name.startswith("ByteDance-Seed/Multi-SWE-bench"): + # Handle dataset path - download if it's Multi-SWE-Bench + if "multi-swe-bench" in dataset_name.lower(): logger.info(f"Downloading Multi-SWE-bench dataset for language: {lang}") dataset_path = download_and_concat_dataset(dataset_name, lang) else: @@ -73,17 +75,39 @@ def run_multi_swebench_evaluation( # Run the Multi-SWE-Bench evaluation logger.info("Running Multi-SWE-Bench evaluation harness...") - cmd = [ - "uv", - "run", - "python", - "-m", - "multi_swe_bench.harness.run_evaluation", + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "multi_swe_bench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "multi_swe_bench.harness.run_evaluation", + ] + + cmd.extend([ "--config", str(config_file.resolve()), "--mode", "evaluation", - ] + ]) logger.info(f"Evaluation command: {' '.join(cmd)}") @@ -96,11 +120,13 @@ def run_multi_swebench_evaluation( error_msg = f"Evaluation failed with return code {result.returncode}" print(f"ERROR: {error_msg}") logger.error(error_msg) + raise subprocess.CalledProcessError(result.returncode, cmd) except Exception as e: error_msg = f"Error running evaluation: {e}" print(f"ERROR: {error_msg}") logger.error(error_msg) + raise def main(): @@ -140,10 +166,17 @@ def main(): logger.info(f"Results saved to {results_file}") # Move the report file to the output location - output_report_path = args.input_file.with_suffix(".report.json") + output_report_path = Path(args.input_file).with_suffix(".report.json") shutil.move(str(results_file), str(output_report_path)) logger.info(f"Report moved to {output_report_path}") + # Add benchmark field to the report + with open(output_report_path, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = f"multiswebench-{args.lang}" + with open(output_report_path, "w") as f: + json.dump(report_data, f, indent=4) + # Update Laminar datapoints with evaluation scores LaminarService.get().update_evaluation_scores( str(args.input_file), str(output_report_path) diff --git a/benchmarks/multiswebench/run_infer.py 
b/benchmarks/multiswebench/run_infer.py index afeb7f6e..ec3337d1 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -118,9 +119,9 @@ def __init__(self, metadata: MultiSWEBenchEvalMetadata, **kwargs): def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Multi-SWE-bench evaluation data") - # Check if this is a ByteDance-Seed/Multi-SWE-bench dataset that needs downloading + # Check if this is a Multi-SWE-bench dataset that needs downloading dataset_path = self.metadata.dataset - if dataset_path.startswith("ByteDance-Seed/Multi-SWE-bench"): + if "multi-swe-bench" in dataset_path.lower(): metadata = cast(MultiSWEBenchEvalMetadata, self.metadata) logger.info( f"Downloading Multi-SWE-bench dataset for language: {metadata.lang}" @@ -207,7 +208,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( "1", @@ -224,20 +225,28 @@ def prepare_workspace( "MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " "agent-server image." 
) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" + try: + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, ) + logger.info(f"Image build output: {output}") + if output.error is not None: + raise RuntimeError(f"Image build failed: {output.error}") + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + except Exception as build_error: + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + logger.info(f"Using pre-built image {agent_server_image}") workspace = DockerWorkspace( server_image=agent_server_image, @@ -246,14 +255,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -262,7 +270,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -366,7 +374,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -416,7 +426,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -442,12 +455,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description 
= ( @@ -472,6 +480,7 @@ def main() -> None: dataset_split=args.split, lang=args.lang, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -481,6 +490,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/openagentsafety/build_images.py b/benchmarks/openagentsafety/build_images.py index acb18384..1548bf2c 100644 --- a/benchmarks/openagentsafety/build_images.py +++ b/benchmarks/openagentsafety/build_images.py @@ -1,6 +1,7 @@ """Build OpenAgentSafety Docker image from vendor/software-agent-sdk""" import logging +import os import subprocess from pathlib import Path @@ -31,6 +32,16 @@ def get_vendor_sdk_commit() -> str: return result.stdout.strip() +def get_image_name() -> str: + image_name = os.getenv("EVAL_AGENT_SERVER_IMAGE", "openagentsafety-agent-server") + tag_prefix = os.getenv("IMAGE_TAG_PREFIX") + if tag_prefix: + tag = f"{tag_prefix}-openagentsafety" + else: + tag = get_vendor_sdk_commit() + return f"{image_name}:{tag}" + + def check_image_exists(image_name: str) -> bool: """Check if a Docker image exists locally.""" result = subprocess.run( @@ -48,12 +59,13 @@ def build_workspace_image(force_rebuild: bool = False, no_cache: bool = False) - force_rebuild: if True, ignore existing images and rebuild. no_cache: if True, pass --no-cache to docker build to avoid layer cache. """ - sdk_commit = get_vendor_sdk_commit() - image_name = f"openagentsafety-agent-server:{sdk_commit}" + image_name = get_image_name() if not force_rebuild and check_image_exists(image_name): logger.info(f"#### Using existing image: {image_name}") return image_name + + sdk_commit = get_vendor_sdk_commit() logger.info(f"#### Building Docker image: {image_name}") logger.info(f"#### SDK version: {sdk_commit}") diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index b9afff4f..e3ca9a8f 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -12,7 +12,11 @@ import requests from jinja2 import Environment, FileSystemLoader -from benchmarks.openagentsafety.build_images import build_workspace_image +from benchmarks.openagentsafety.build_images import ( + build_workspace_image, + check_image_exists, + get_image_name, +) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.conversation import build_event_persistence_callback from benchmarks.utils.critics import create_critic @@ -20,6 +24,7 @@ from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace @@ -38,12 +43,16 @@ def convert_numpy_types(obj: Any) -> Any: return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() - elif pd.isna(obj): - return None elif isinstance(obj, dict): return {k: convert_numpy_types(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_numpy_types(item) for item in obj] + else: + try: + if pd.isna(obj): + return None + except (ValueError, 
TypeError): + pass return obj @@ -57,8 +66,13 @@ def default(self, o): return float(o) elif isinstance(o, np.ndarray): return o.tolist() - elif pd.isna(o): - return None + elif hasattr(o, "model_dump"): + return o.model_dump() + try: + if pd.isna(o): + return None + except (ValueError, TypeError): + pass return super().default(o) @@ -183,7 +197,7 @@ def cleanup_docker_containers(): "-a", "-q", "--filter", - "ancestor=openagentsafety-agent-server:local", + f"ancestor={get_image_name()}", ], capture_output=True, text=True, @@ -378,7 +392,17 @@ def prepare_workspace( resource_factor: Resource factor for runtime allocation (default: 1). forward_env: Environment variables to forward into the workspace. """ - server_image = build_workspace_image() + # Try to build image on-the-fly, fall back to pre-built if build fails + try: + server_image = build_workspace_image() + except Exception as build_error: + server_image = get_image_name() + + if not check_image_exists(server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {server_image} does not exist" + ) + logger.info(f"Using pre-built image {server_image}") workspace = DockerWorkspace( server_image=server_image, @@ -462,7 +486,9 @@ def event_callback(event) -> None: try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) logger.info(f"Conversation completed for {instance.id}") except ValidationError as e: logger.warning(f"Validation error from custom events (continuing): {e}") @@ -530,6 +556,65 @@ def event_callback(event) -> None: ) +def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None: + """Generate a .report.json from the output.jsonl, matching the format + expected by nemo_evaluator (same schema as SWE-Bench / GAIA reports).""" + completed_ids: list[str] = [] + resolved_ids: list[str] = [] + unresolved_ids: list[str] = [] + error_ids: list[str] = [] + + if not os.path.exists(output_jsonl): + logger.warning("No output.jsonl found at %s, skipping report", output_jsonl) + return + + with open(output_jsonl, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + continue + + instance_id = data.get("instance_id", "") + error = data.get("error") + test_result = data.get("test_result", {}) + + if error or test_result.get("error"): + error_ids.append(instance_id) + else: + completed_ids.append(instance_id) + # Treat as resolved when there is no error + resolved_ids.append(instance_id) + + submitted_ids = completed_ids + error_ids + report = { + "benchmark": "openagentsafety", + "model_name_or_path": model_name, + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": 0, + "resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "empty_patch_instances": 0, + "error_instances": len(error_ids), + "submitted_ids": submitted_ids, + "completed_ids": completed_ids, + "incomplete_ids": [], + "resolved_ids": resolved_ids, + "unresolved_ids": unresolved_ids, + } + + with open(report_path, "w") as f: + json.dump(report, f, indent=4) + + logger.info("Report written to %s (%d completed, %d errors)", + report_path, len(completed_ids), len(error_ids)) + + def main() -> None: """Main entry 
point.""" parser = get_parser(add_llm_config=True) @@ -542,12 +627,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct output directory @@ -572,15 +652,18 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={ - "server_image": "openagentsafety-agent-server:local", + "server_image": get_image_name(), "platform": "linux/amd64", }, eval_limit=args.n_limit, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, + max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, ) # Initial cleanup @@ -636,6 +719,12 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None: # Run evaluation evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir)) + # Generate .report.json for nemo_evaluator compatibility + report_path = os.path.join( + metadata.eval_output_dir, "output.report.json" + ) + generate_report(evaluator.output_path, report_path, llm.model) + # Final cleanup cleanup_docker_containers() diff --git a/benchmarks/scripts/generate_llm_config.py b/benchmarks/scripts/generate_llm_config.py new file mode 100644 index 00000000..344ac36b --- /dev/null +++ b/benchmarks/scripts/generate_llm_config.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path + + +def generate_config( + model: str, + output_path: str, + api_base_url: str | None = None, + api_key_env: str | None = None, + temperature: float | None = None, + top_p: float | None = None, + max_completion_tokens: int | None = None, + timeout: int | None = None, + max_retries: int | None = None, +) -> None: + llm_config: dict[str, object] = {"model": model} + + if api_base_url: + llm_config["base_url"] = api_base_url + if api_key_env: + llm_config["api_key_env"] = api_key_env + if temperature is not None: + llm_config["temperature"] = temperature + if top_p is not None: + llm_config["top_p"] = top_p + if max_completion_tokens is not None: + llm_config["max_output_tokens"] = max_completion_tokens + if timeout is not None: + llm_config["timeout"] = timeout + if max_retries is not None: + llm_config["num_retries"] = max_retries + + out_path = Path(output_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(llm_config, indent=2) + "\n", encoding="utf-8") + + print(f"Wrote LLM config to {str(out_path)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate LLM config from CLI args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument("--model", type=str, required=True, help="Model name/id") + parser.add_argument("--api-base-url", type=str, help="API base URL") + parser.add_argument( + "--api-key-env", + type=str, + help="Environment variable name containing the API key", + ) + parser.add_argument("--temperature", type=float, help="Sampling temperature") + parser.add_argument("--top-p", type=float, help="Nucleus sampling (top-p)") + 
parser.add_argument("--max-completion-tokens", type=int, help="Max completion tokens") + parser.add_argument("--timeout", type=int, help="API timeout in seconds") + parser.add_argument("--max-retries", type=int, help="Max API call retries") + parser.add_argument( + "--output-path", + type=str, + required=True, + help="Where to write the generated JSON config", + ) + + args = parser.parse_args() + + generate_config( + model=args.model, + output_path=args.output_path, + api_base_url=args.api_base_url, + api_key_env=args.api_key_env, + temperature=args.temperature, + top_p=args.top_p, + max_completion_tokens=args.max_completion_tokens, + timeout=args.timeout, + max_retries=args.max_retries, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/run_benchmark.py b/benchmarks/scripts/run_benchmark.py new file mode 100644 index 00000000..7b6b1cf8 --- /dev/null +++ b/benchmarks/scripts/run_benchmark.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from pathlib import Path + +from benchmarks.scripts.generate_llm_config import generate_config + + +INFER_ENTRYPOINTS = { + "swebench": "swebench-infer", + "gaia": "gaia-infer", + "commit0": "commit0-infer", + "multiswebench": "multiswebench-infer", + "swtbench": "swtbench-infer", + "swebenchmultimodal": "swebenchmultimodal-infer", + "openagentsafety": "openagentsafety-infer", +} + +EVAL_ENTRYPOINTS = { + "swebench": "swebench-eval", + "gaia": "gaia-eval", + "commit0": "commit0-eval", + "multiswebench": "multiswebench-eval", + "swtbench": "swtbench-eval", + "swebenchmultimodal": "swebenchmultimodal-eval", + # openagentsafety doesn't have a separate eval entrypoint +} + +# Patch-based benchmarks use "finish_with_patch" (requires git patch). +# gaia and openagentsafety use "pass" (accept any completed output). +BENCHMARK_CRITIC = { + "swebench": "finish_with_patch", + "swtbench": "finish_with_patch", + "swebenchmultimodal": "finish_with_patch", + "multiswebench": "finish_with_patch", + "commit0": "finish_with_patch", + "gaia": "pass", + "openagentsafety": "pass", +} + + +def _build_infer_cmd(args: argparse.Namespace, llm_config_path: Path) -> list[str]: + """Build the inference command with benchmark-specific args.""" + cmd = [ + INFER_ENTRYPOINTS[args.benchmark], + str(llm_config_path), + "--workspace", args.workspace, + "--max-iterations", str(args.max_iterations), + "--conversation-timeout", str(args.conversation_timeout), + "--num-workers", str(args.num_workers), + "--output-dir", str(args.output_dir), + "--max-attempts", str(args.max_attempts), + "--max-retries", str(args.instance_max_retries), + "--critic", BENCHMARK_CRITIC.get(args.benchmark, "finish_with_patch"), + ] + if args.dataset: + cmd.extend(["--dataset", args.dataset]) + if args.split: + cmd.extend(["--split", args.split]) + + if args.note: + cmd.extend(["--note", args.note]) + if args.n_limit is not None: + cmd.extend(["--n-limit", str(args.n_limit)]) + if args.skip_failed_samples: + cmd.append("--skip-failed-samples") + + # ----- Benchmark-specific inference args ----- + + # GAIA requires --level (e.g. 2023_level1, 2023_all) + if args.benchmark == "gaia" and args.level: + cmd.extend(["--level", args.level]) + + # commit0 requires --repo-split (e.g. lite, all) + if args.benchmark == "commit0" and args.repo_split: + cmd.extend(["--repo-split", args.repo_split]) + + # multiswebench requires --lang (e.g. 
java, python, go, c) + if args.benchmark == "multiswebench" and args.language: + cmd.extend(["--lang", args.language]) + + return cmd + + +def _build_eval_cmd(args: argparse.Namespace, output_jsonl: Path) -> list[str]: + """Build the evaluation command with benchmark-specific args.""" + benchmark = args.benchmark + if benchmark not in EVAL_ENTRYPOINTS: + return [] + + cmd = [EVAL_ENTRYPOINTS[benchmark], str(output_jsonl)] + + if benchmark in ("swebench", "swebenchmultimodal") and args.dataset: + cmd.extend(["--dataset", args.dataset]) + + if benchmark == "swebench": + cmd.extend(["--run-id", "eval"]) + if benchmark in ("swebench", "swebenchmultimodal"): + if args.modal is True: + cmd.append("--modal") + elif args.modal is False: + cmd.append("--no-modal") + + if benchmark == "multiswebench" and args.dataset: + cmd.extend(["--dataset", args.dataset]) + if args.language: + cmd.extend(["--lang", args.language]) + + return cmd + + +def main() -> None: + parser = argparse.ArgumentParser() + + # LLM config generation args + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--api-base-url", type=str, required=True) + parser.add_argument("--api-key-env", type=str, default=None, help="Env var name for API key") + parser.add_argument("--temperature", type=float, default=0.0) + parser.add_argument("--top-p", type=float, default=1.0) + parser.add_argument("--max-completion-tokens", type=int, default=4096) + parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--max-retries", type=int, default=3) + + # Benchmark selection + parser.add_argument("--benchmark", required=True, choices=INFER_ENTRYPOINTS.keys()) + + # Common inference args + parser.add_argument("--dataset", type=str, default=None) + parser.add_argument("--split", type=str, default=None) + parser.add_argument("--workspace", type=str, default="docker") + parser.add_argument("--max-iterations", type=int, default=100) + parser.add_argument("--conversation-timeout", type=float, default=3600.0) + parser.add_argument("--num-workers", type=int, default=1) + parser.add_argument("--note", type=str, default="") + parser.add_argument("--output-dir", type=str, required=True) + parser.add_argument("--max-attempts", type=int, default=3) + parser.add_argument("--instance-max-retries", type=int, default=3) + parser.add_argument("--n-limit", type=int, default=None) + parser.add_argument("--skip-failed-samples", action="store_true") + + # GAIA + parser.add_argument("--level", type=str, default="2023_all", + help="GAIA level (e.g. 2023_level1, 2023_all)") + # commit0 + parser.add_argument("--repo-split", type=str, default="lite", + help="commit0 repo split (lite, all, or repo name)") + # multiswebench + parser.add_argument("--language", type=str, default=None, + help="multiswebench language (java, python, go, c)") + # swebench/swebenchmultimodal + parser.add_argument( + "--modal", + dest="modal", + action=argparse.BooleanOptionalAction, + default=None, + help=( + "Enable/disable Modal for swebench and swebenchmultimodal evaluation. " + "If omitted, each benchmark uses its default." 
+ ), + ) + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + llm_config_path = output_dir / "llm_config.json" + + # 1) Generate LLM config + generate_config( + model=args.model, + api_base_url=args.api_base_url, + api_key_env=args.api_key_env, + temperature=args.temperature, + top_p=args.top_p, + max_completion_tokens=args.max_completion_tokens, + timeout=args.timeout, + max_retries=args.max_retries, + output_path=str(llm_config_path), + ) + + # 2) Run inference + # multiswebench reads LANGUAGE env var at module level for Docker image naming + if args.benchmark == "multiswebench" and args.language: + os.environ["LANGUAGE"] = args.language + + infer_cmd = _build_infer_cmd(args, llm_config_path) + ret = subprocess.call(infer_cmd) + if ret != 0: + sys.exit(ret) + + # 3) Find output.jsonl and run evaluation + output_files = sorted(output_dir.rglob("output.jsonl")) + if not output_files: + print(f"ERROR: Inference did not produce output.jsonl under {output_dir}", file=sys.stderr) + sys.exit(1) + + output_jsonl = output_files[-1] # Use the latest one + + eval_cmd = _build_eval_cmd(args, output_jsonl) + if eval_cmd: + sys.exit(subprocess.call(eval_cmd)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index bfed6217..1939e3e9 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -143,14 +143,34 @@ def run_swebench_evaluation( predictions_dir = predictions_path.parent predictions_filename = predictions_path.name - # Run SWE-Bench evaluation using global python (not UV environment) - # since swebench is installed globally - cmd = [ - "uv", - "run", - "python", - "-m", - "swebench.harness.run_evaluation", + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "swebench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "swebench.harness.run_evaluation", + ] + + cmd.extend([ "--dataset_name", dataset, "--predictions_path", @@ -159,7 +179,7 @@ def run_swebench_evaluation( str(workers), "--run_id", run_id, - ] + ]) # Add parameters cmd.extend(["--split", split]) @@ -316,6 +336,16 @@ def main() -> None: shutil.move(str(report_path), str(dest_report_path)) logger.info(f"Moved report file to: {dest_report_path}") + # Add benchmark field to the report + with open(dest_report_path, "r") as f: + report_data = json.load(f) + if isinstance(args.dataset, str) and "/" in args.dataset: + report_data["benchmark"] = args.dataset.split("/")[-1].lower() + else: + report_data["benchmark"] = str(args.dataset).lower() + with open(dest_report_path, "w") as f: + json.dump(report_data, f, indent=4) + # Update Laminar datapoints with evaluation scores LaminarService.get().update_evaluation_scores( str(input_file), str(dest_report_path) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index daafe8ad..f39e151d 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config 
import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -124,7 +125,7 @@ def prepare_workspace( f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" ) base_agent_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) wrap_needed = should_wrap_instance_id(instance.id) agent_server_image = base_agent_image @@ -170,14 +171,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -282,7 +282,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -323,7 +325,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -344,12 +349,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -373,6 +373,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -382,6 +383,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 86bfd298..fc8efa47 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -147,6 +147,7 @@ def update_report_with_component_scores(report_json_path: 
Path) -> dict[str, flo report = json.load(f) # Add component scores to report + report["benchmark"] = "swebench-multimodal" report["component_scores"] = scores # Write updated report @@ -246,6 +247,7 @@ def run_swebench_multimodal_evaluation( split: str = "dev", workers: str = "12", run_id: str | None = None, + modal: bool = True, ) -> Path | None: """ Run SWE-Bench Multimodal evaluation on the predictions file. @@ -262,68 +264,96 @@ def run_swebench_multimodal_evaluation( """ logger.info(f"Running SWE-Bench Multimodal evaluation on {predictions_file}") - # Get the directory of the predictions file - predictions_path = Path(predictions_file) - predictions_dir = predictions_path.parent - predictions_filename = predictions_path.name - - # Default for run_id if not provided - run_id = run_id or predictions_path.stem - - # Run SWE-Bench Multimodal evaluation using UV environment - # The key difference from regular SWE-Bench is the --modal true flag - cmd = [ - "uv", - "run", - "python", - "-m", - "swebench.harness.run_evaluation", - "--dataset_name", - dataset, - "--split", - split, - "--predictions_path", - predictions_filename, - "--max_workers", - str(workers), - "--modal", - "true", - "--run_id", - run_id, - ] - - logger.info(f"Running command: {' '.join(cmd)}") - logger.info(f"Working directory: {predictions_dir}") - logger.info("SWE-Bench Multimodal evaluation output:") - print("-" * 80) - try: + # Get the directory of the predictions file + predictions_path = Path(predictions_file) + predictions_dir = predictions_path.parent + predictions_filename = predictions_path.name + + # Generate run_id if not provided + if run_id is None: + run_id = f"eval_{predictions_path.stem}" + + # Run SWE-Bench Multimodal evaluation + # The key difference from regular SWE-Bench is the --modal true flag + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "swebench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "swebench.harness.run_evaluation", + ] + + cmd.extend([ + "--dataset_name", + dataset, + "--split", + split, + "--predictions_path", + predictions_filename, + "--max_workers", + str(workers), + "--run_id", + run_id, + ]) + if modal: + cmd.extend(["--modal", "true"]) + + logger.info(f"Running command: {' '.join(cmd)}") + logger.info(f"Working directory: {predictions_dir}") + logger.info("SWE-Bench Multimodal evaluation output:") + print("-" * 80) + + # Stream output directly to console, running from predictions file directory result = subprocess.run(cmd, text=True, cwd=predictions_dir) - except FileNotFoundError as e: + + print("-" * 80) + if result.returncode == 0: + logger.info("SWE-Bench Multimodal evaluation completed successfully") + else: + logger.error( + f"SWE-Bench Multimodal evaluation failed with return code {result.returncode}" + ) + raise subprocess.CalledProcessError(result.returncode, cmd) + + # SWE-Bench multimodal writes its summary to ..json + report_path = predictions_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" + if not report_path.exists(): + raise FileNotFoundError( + f"Expected report file not found: {report_path}. " + "SWE-Bench harness output naming may have changed." 
+ ) + + return report_path + + except FileNotFoundError: logger.error( "SWE-Bench evaluation command not found. " "Make sure SWE-Bench is properly installed." ) - raise e - - print("-" * 80) - if result.returncode == 0: - logger.info("SWE-Bench Multimodal evaluation completed successfully") - else: - logger.error( - f"SWE-Bench Multimodal evaluation failed with return code {result.returncode}" - ) - raise subprocess.CalledProcessError(result.returncode, cmd) - - # SWE-Bench multimodal writes its summary to ..json - report_path = predictions_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" - if not report_path.exists(): - raise FileNotFoundError( - f"Expected report file not found: {report_path}. " - "SWE-Bench harness output naming may have changed." - ) - logger.info(f"Found report.json at: {report_path}") - return report_path + raise + except Exception as e: + logger.error(f"Error running SWE-Bench Multimodal evaluation: {e}") + raise def main() -> None: @@ -368,6 +398,13 @@ def main() -> None: help="Number of workers to use when evaluating", ) + parser.add_argument( + "--modal", + action=argparse.BooleanOptionalAction, + default=True, + help="Use Modal for evaluation (default: True). Use --no-modal for local evaluation.", + ) + parser.set_defaults(**EVAL_DEFAULTS) parser.add_argument( @@ -405,7 +442,7 @@ def main() -> None: if not args.skip_evaluation: # Run multimodal evaluation report_path = run_swebench_multimodal_evaluation( - str(output_file), args.dataset, args.split, args.workers, args.run_id + str(output_file), args.dataset, args.split, args.workers, args.run_id, args.modal ) # Calculate component scores if we have a report @@ -414,6 +451,15 @@ def main() -> None: "Calculating component scores (solveable/unsolveable accuracy)..." ) component_scores = update_report_with_component_scores(report_path) + # Export a .report.json artifact so framework parsers + # can discover benchmark results consistently across benchmarks. 
+ with open(report_path, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = "swebench-multimodal" + dest_report_path = input_file.with_suffix(".report.json") + with open(dest_report_path, "w") as f: + json.dump(report_data, f, indent=4) + logger.info(f"Wrote report artifact to: {dest_report_path}") if component_scores: logger.info("=" * 60) logger.info("COMPONENT SCORES SUMMARY") diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 02101697..ee968cfe 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -23,13 +23,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( LLM, Agent, @@ -160,7 +161,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") logger.info(f"SKIP_BUILD={SKIP_BUILD}") @@ -196,14 +197,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -212,7 +212,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -369,7 +369,9 @@ def evaluate_instance( logger.info("No image_assets found, sending text-only instruction") conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -411,7 +413,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -433,12 +438,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not 
os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -462,6 +462,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -471,6 +472,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index c245aa42..fb8ebedc 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -148,7 +148,9 @@ def update_report_with_submitted_instances( ) -def convert_to_swtbench_format(input_file: str, output_file: str) -> None: +def convert_to_swtbench_format( + input_file: str, output_file: str +) -> None: """ Convert OpenHands output.jsonl to SWT-Bench prediction format. @@ -271,21 +273,32 @@ def run_swtbench_evaluation( # but using the uv environment's python executable which has all dependencies benchmarks_dir = Path(__file__).parent.parent.parent - # Get the python executable from the uv environment - python_executable = subprocess.run( - [ - "uv", - "run", - "--directory", - str(benchmarks_dir), - "python", - "-c", - "import sys; print(sys.executable)", - ], - capture_output=True, - text=True, - cwd=benchmarks_dir, - ).stdout.strip() + # Get the python executable from the uv environment, fall back to current interpreter + try: + uv_result = subprocess.run( + [ + "uv", + "run", + "--directory", + str(benchmarks_dir), + "python", + "-c", + "import sys; print(sys.executable)", + ], + capture_output=True, + text=True, + cwd=benchmarks_dir, + ) + uv_available = uv_result.returncode == 0 and uv_result.stdout.strip() + except FileNotFoundError: + uv_available = False + uv_result = None + + if uv_available: + python_executable = uv_result.stdout.strip() + else: + python_executable = sys.executable + logger.info("uv not available, using current Python interpreter") # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() @@ -301,7 +314,7 @@ def run_swtbench_evaluation( "--max_workers", str(workers), "--run_id", - predictions_path.stem, + f"eval_{predictions_path.stem}", ] logger.info(f"Using Python executable: {python_executable}") @@ -436,7 +449,7 @@ def main() -> None: cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" swt_bench_dir = cache_dir / "swt-bench" report_dir = swt_bench_dir / "evaluation_results" - run_id = output_file.stem + run_id = f"eval_{output_file.stem}" report_file = report_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" target_dir = input_file.parent @@ -444,6 +457,14 @@ def main() -> None: shutil.move(str(report_file), str(target_file)) logger.info(f"Moved evaluation report to: {target_file}") dest_report_path = target_file + + # Add benchmark field to the report + with open(target_file, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = "swtbench" + with open(target_file, "w") as f: + json.dump(report_data, f, indent=4) + update_report_with_submitted_instances(target_file, output_file) # Update Laminar 
datapoints with evaluation scores diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index af2724bb..e85b8a13 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -17,14 +17,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.agent_server.docker.build import _base_slug +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -53,6 +53,10 @@ def get_agent_server_docker_image( target: str = "source-minimal", ) -> str: """Get the agent server Docker image for an instance.""" + # Importing here because openhands.agent_server.docker.build runs git checks + # which fails when installed as a package outside the git repo + from openhands.agent_server.docker.build import _base_slug + official_image_name = get_official_docker_image(instance_id, docker_image_prefix) return ( "ghcr.io/all-hands-ai/agent-server" @@ -166,7 +170,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") logger.info(f"SKIP_BUILD={SKIP_BUILD}") @@ -180,12 +184,25 @@ def prepare_workspace( "agent-server image." 
) # For SWT-bench, we use DockerDevWorkspace with base_image - workspace = DockerDevWorkspace( - base_image=official_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], - ) + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image=official_docker_image, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + except Exception as build_error: + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") else: workspace = DockerWorkspace( server_image=agent_server_image, @@ -194,14 +211,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -210,7 +226,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -303,7 +319,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -344,7 +362,10 @@ def evaluate_instance( def main() -> None: """Main entry point for SWT-bench evaluation.""" prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -365,12 +386,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -392,6 +408,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -401,6 +418,7 @@ def main() -> 
None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 6ae98855..f0818064 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -49,6 +49,14 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: default=500, help="Maximum iterations (default: 500)", ) + parser.add_argument( + "--conversation-timeout", + type=float, + default=3600.0, + help=( + "Timeout (seconds) for a single Conversation.run() call on remote workspaces " + ), + ) parser.add_argument("--num-workers", type=int, help="Number of inference workers") parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( @@ -84,4 +92,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: default=3, help="Maximum retries for instances that throw exceptions (default: 3)", ) + parser.add_argument( + "--skip-failed-samples", + action="store_true", + help="Skip failed samples and treat as not solved", + ) return parser diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 9c700f1d..1653f84a 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -28,7 +28,6 @@ ) from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.image_utils import image_exists -from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -279,9 +278,13 @@ def build_image( base_image: str, target_image: str, custom_tag: str, - target: TargetType = "source-minimal", + target: str = "source-minimal", push: bool = False, ) -> BuildOutput: + # Importing here because openhands.agent_server.docker.build runs git checks + # which fails when installed as a package outside the git repo + from openhands.agent_server.docker.build import BuildOptions, build + # Get SDK info from submodule to ensure tags use the correct SDK SHA git_ref, git_sha, sdk_version = _get_sdk_submodule_info() @@ -312,7 +315,7 @@ def _build_with_logging( base_image: str, target_image: str, custom_tag: str = "", - target: TargetType = "source-minimal", + target: str = "source-minimal", push: bool = False, max_retries: int = 3, post_build_fn: Callable[[BuildOutput, bool], BuildOutput] | None = None, @@ -405,7 +408,7 @@ def default_build_output_dir( def build_all_images( base_images: list[str], - target: TargetType, + target: str, build_dir: Path, image: str = EVAL_AGENT_SERVER_IMAGE, push: bool = False, diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 882f1b63..ddd96006 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,5 +1,11 @@ +import os + OUTPUT_FILENAME = "output.jsonl" -EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" + +# Image name for agent server (can be overridden via env var) +EVAL_AGENT_SERVER_IMAGE = os.getenv( + "OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server" +) # Model identifier used in swebench-style prediction entries. 
# The swebench harness uses this value to create log directory structures diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 32177e38..1548b624 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -6,6 +6,7 @@ import json import os import sys +import traceback import time from abc import ABC, abstractmethod from concurrent.futures import FIRST_COMPLETED, Future, ProcessPoolExecutor, wait @@ -55,6 +56,15 @@ class PendingInstance: OnResult = Callable[[EvalInstance, EvalOutput], None] +class SampleFailedError(Exception): + """Raised when a sample fails and skip_failed_samples=False.""" + + def __init__(self, instance_id: str, error: str): + self.instance_id = instance_id + self.error = error + super().__init__(f"Sample {instance_id} failed: {error}") + + class Evaluation(ABC, BaseModel): """Abstract orchestrator for instance processing (process-based).""" @@ -132,12 +142,23 @@ def evaluate_instance( raise NotImplementedError def _create_error_output( - self, instance: EvalInstance, error: Exception, retry_count: int + self, + instance: EvalInstance, + error: Exception, + retry_count: int, + *, + stack: str | None = None, ) -> EvalOutput: """Create an EvalOutput object for a failed instance.""" + err_type = error.__class__.__name__ + err_msg = str(error) return EvalOutput( instance_id=instance.id, - test_result={}, + test_result={ + "error_type": err_type, + "error_message": err_msg, + "error_stack": stack, + }, instruction=None, error=( f"Instance failed after {retry_count} retries. Last error: {str(error)}" @@ -411,6 +432,10 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: instance, out = fut.result() pending_info = pending_instances.get(fut) + # Fail fast if skip_failed_samples=False and sample errored + if out.error and not self.metadata.skip_failed_samples: + raise SampleFailedError(instance.id, out.error) + # Add Laminar metadata to EvalOutput if out.metadata is None: out.metadata = self.metadata.model_copy(deep=True) @@ -422,6 +447,9 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: ) attempt_on_result(instance, out) + except SampleFailedError: + # Re-raise to fail the entire evaluation + raise except Exception as e: logger.error( f"Unexpected error from worker process: {str(e)[:50]}", @@ -455,6 +483,7 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: f"{self.instance_timeout}s timeout" ), attempt, + stack=None, ) if error_output.metadata is None: error_output.metadata = self.metadata.model_copy( @@ -581,6 +610,7 @@ def _process_one_mp( retry_count = 0 runtime_failure_count = 0 last_error = None + last_error_stack: str | None = None max_retries = self.metadata.max_retries runtime_runs: list[RemoteRuntimeAllocation] = [] @@ -648,6 +678,7 @@ def _process_one_mp( return instance, out except Exception as e: last_error = e + last_error_stack = traceback.format_exc() retry_count += 1 lmnr_span.record_exception(e) @@ -692,7 +723,10 @@ def _process_one_mp( ) # Create error output for final failure error_output = self._create_error_output( - instance, last_error, max_retries + instance, + last_error, + max_retries, + stack=last_error_stack, ) if runtime_runs: error_output.runtime_runs = runtime_runs @@ -723,7 +757,10 @@ def _process_one_mp( # This should never be reached, but added for type safety error_output = self._create_error_output( - instance, Exception("Unexpected error: no attempts made"), max_retries + instance, + Exception("Unexpected error: no 
attempts made"), + max_retries, + stack=None, ) if runtime_runs: error_output.runtime_runs = runtime_runs diff --git a/benchmarks/utils/fake_user_response.py b/benchmarks/utils/fake_user_response.py index 8b2848fa..befcfa75 100644 --- a/benchmarks/utils/fake_user_response.py +++ b/benchmarks/utils/fake_user_response.py @@ -119,6 +119,7 @@ def run_conversation_with_fake_user_response( conversation: "BaseConversation", fake_user_response_fn: FakeUserResponseFn = fake_user_response, max_fake_responses: int = 10, + run_timeout: float | None = None, ) -> None: """Run a conversation with automatic fake user responses. @@ -137,13 +138,20 @@ def run_conversation_with_fake_user_response( Defaults to fake_user_response. max_fake_responses: Maximum number of fake responses to send before stopping. This prevents infinite loops. + run_timeout: Optional timeout in seconds for conversation.run() calls """ fake_response_count = 0 + # Only RemoteConversation.run() supports a timeout kwarg. + from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation + while True: # Run the conversation - conversation.run() + if run_timeout is not None and isinstance(conversation, RemoteConversation): + conversation.run(timeout=run_timeout) + else: + conversation.run() # Check the execution status status = conversation.state.execution_status diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index a463f3b4..b328a54f 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import base64 +import subprocess import sys import requests @@ -54,12 +55,31 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: return None +def _local_image_exists(image_ref: str) -> bool: + """Check if image exists in local Docker daemon.""" + try: + result = subprocess.run( + ["docker", "images", "-q", image_ref], + capture_output=True, + text=True, + timeout=10, + ) + return bool(result.stdout.strip()) + except (subprocess.SubprocessError, FileNotFoundError): + return False + + def image_exists( image_ref: str, gh_username: str | None = None, gh_pat: str | None = None, # GitHub PAT with read:packages for private GHCR docker_token: str | None = None, # Docker Hub JWT if you already have one ) -> bool: + # Check local Docker first + if _local_image_exists(image_ref): + return True + + # Then check remote registry registry, repo, ref = _parse(image_ref) headers = {"Accept": ACCEPT} diff --git a/benchmarks/utils/llm_config.py b/benchmarks/utils/llm_config.py new file mode 100644 index 00000000..952704cd --- /dev/null +++ b/benchmarks/utils/llm_config.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path + +from openhands.sdk import LLM + + +def load_llm_config(config_path: str | Path) -> LLM: + config_path = Path(config_path) + if not config_path.is_file(): + raise ValueError(f"LLM config file {config_path} does not exist") + + with config_path.open("r") as f: + llm_config = json.load(f) + + # load api_key from env var if api_key_env is specified + if "api_key_env" in llm_config: + env_var = llm_config.pop("api_key_env") + api_key = os.environ.get(env_var, "") + if not api_key: + raise ValueError( + f"Environment variable {env_var} is not set or empty. " + f"Please set it with your API key." 
+ ) + llm_config["api_key"] = api_key + + # strip /chat/completions from base_url for LiteLLM compatibility + if "base_url" in llm_config: + base_url = llm_config["base_url"] + base_url = base_url.rstrip("/") + if base_url.endswith("/chat/completions"): + base_url = base_url.removesuffix("/chat/completions") + llm_config["base_url"] = base_url + + return LLM.model_validate(llm_config) diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index f04b405d..4c4738b4 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -18,6 +18,13 @@ class EvalMetadata(BaseModel): dataset: str dataset_split: str = Field(default="test") max_iterations: int + conversation_timeout: float = Field( + default=3600.0, + ge=0, + description=( + "Timeout in seconds for a single Conversation.run() call (remote workspaces). " + ), + ) eval_output_dir: str details: dict[str, Any] | None = None prompt_path: str | None = Field( @@ -48,6 +55,13 @@ class EvalMetadata(BaseModel): ge=0, description="Maximum number of retries for instances that throw exceptions", ) + skip_failed_samples: bool = Field( + default=True, + description=( + "If True, failed samples are skipped and treated as not solved. " + "If False, the entire evaluation fails on the first failed sample." + ), + ) workspace_type: Literal["docker", "remote"] = Field( default="docker", description="Type of workspace to use, e.g., 'docker' or 'remote'", diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index 951c6592..15b2da3f 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,4 +1,6 @@ +import os import subprocess +import warnings from pathlib import Path @@ -18,10 +20,19 @@ def _get_submodule_sha(submodule_path: Path) -> str: def get_sdk_sha() -> str: """ - Get the current git sha from the SDK submodule. + Get the SDK SHA from git submodule, falling back to "unknown". """ - return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk") + try: + return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk") + except subprocess.CalledProcessError: + warnings.warn( + "Could not get SDK SHA from git submodule. Using 'unknown' as fallback. " + ) + return "unknown" SDK_SHA = get_sdk_sha() SDK_SHORT_SHA = SDK_SHA[:7] + +# This is used as the first part of the image tag: -- +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) diff --git a/nemo_evaluator/openhands_benchmarks/__init__.py b/nemo_evaluator/openhands_benchmarks/__init__.py new file mode 100644 index 00000000..b28b04f6 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/__init__.py @@ -0,0 +1,3 @@ + + + diff --git a/nemo_evaluator/openhands_benchmarks/framework.yml b/nemo_evaluator/openhands_benchmarks/framework.yml new file mode 100644 index 00000000..c14aa91d --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/framework.yml @@ -0,0 +1,266 @@ +framework: + name: openhands_benchmarks + pkg_name: openhands_benchmarks + full_name: OpenHands Benchmarks + description: Multi-benchmark evaluation harness using the OpenHands agent framework. 
+ url: https://github.com/All-Hands-AI/openhands-agent-benchmarks + +defaults: + command: >- + python3 -m benchmarks.scripts.run_benchmark + --model openai/{{target.api_endpoint.model_id}} + --api-base-url {{target.api_endpoint.url}} + {% if target.api_endpoint.api_key_name is not none %}--api-key-env {{target.api_endpoint.api_key_name}}{% endif %} + --temperature {{config.params.temperature}} + --top-p {{config.params.top_p}} + --max-completion-tokens {{config.params.max_new_tokens}} + --timeout {{config.params.request_timeout}} + --max-retries {{config.params.max_retries}} + --benchmark {{config.params.extra.benchmark}} + {% if config.params.extra.dataset is defined and config.params.extra.dataset is not none %}--dataset {{config.params.extra.dataset}}{% endif %} + {% if config.params.extra.split is defined and config.params.extra.split is not none %}--split {{config.params.extra.split}}{% endif %} + --workspace {{config.params.extra.workspace}} + --max-iterations {{config.params.extra.max_steps}} + --conversation-timeout {{config.params.extra.conversation_timeout}} + --num-workers {{config.params.parallelism}} + --note {{config.type}} + --output-dir {{config.output_dir}} + --max-attempts {{config.params.extra.max_attempts}} + --instance-max-retries {{config.params.extra.instance_max_retries}} + {% if config.params.limit_samples is not none %}--n-limit {{config.params.limit_samples}}{% endif %} + {% if config.params.extra.skip_failed_samples %}--skip-failed-samples{% endif %} + {% if config.params.extra.level is defined and config.params.extra.level is not none %}--level {{config.params.extra.level}}{% endif %} + {% if config.params.extra.repo_split is defined and config.params.extra.repo_split is not none %}--repo-split {{config.params.extra.repo_split}}{% endif %} + {% if config.params.extra.language is defined and config.params.extra.language is not none %}--language {{config.params.extra.language}}{% endif %} + {% if config.params.extra.modal is defined and config.params.extra.modal is not none %}{% if config.params.extra.modal %}--modal{% else %}--no-modal{% endif %}{% endif %} + + config: + params: + limit_samples: null + temperature: 0.6 + top_p: 1.0 + max_new_tokens: 64000 + request_timeout: 84000 + max_retries: 5 + parallelism: 1 + extra: + workspace: docker + max_steps: 100 + conversation_timeout: 28000 + max_attempts: 3 + instance_max_retries: 3 + skip_failed_samples: false + target: + api_endpoint: + adapter_config: + mode: client # disable adapters by default + +evaluations: + # SWE-bench variants + - name: swebench-verified + description: SWE-bench Verified - 500 human-validated GitHub issues + defaults: + config: + type: swebench-verified + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Verified + split: test + + - name: swebench-lite + description: SWE-bench Lite - 300 curated GitHub issues + defaults: + config: + type: swebench-lite + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Lite + split: test + + - name: swebench-full + description: SWE-bench Full - Complete dataset of GitHub issues + defaults: + config: + type: swebench-full + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench + split: test + + # GAIA benchmark + - name: gaia + description: GAIA - General AI Assistant benchmark for real-world tasks requiring reasoning, tool use, and web browsing + defaults: + config: + type: gaia + 
supported_endpoint_types: [chat] + params: + extra: + benchmark: gaia + dataset: gaia-benchmark/GAIA + split: test + level: "2023_all" + + # Commit0 benchmark + - name: commit0 + description: Commit0 - Repository-level code generation benchmark + defaults: + config: + type: commit0 + supported_endpoint_types: [chat] + params: + extra: + benchmark: commit0 + dataset: wentingzhao/commit0_combined + split: test + repo_split: lite + max_attempts: 1 + + # Multi-SWE-bench (multilingual) + - name: multiswebench-java + description: Multi-SWE-bench Java - Multilingual SWE-bench for Java repositories + defaults: + config: + type: multiswebench-java + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: java_verified + language: java + + - name: multiswebench-python # empty subset + description: Multi-SWE-bench Python - Multilingual SWE-bench for Python repositories + defaults: + config: + type: multiswebench-python + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: python_verified + language: python + + - name: multiswebench-go + description: Multi-SWE-bench Go - Multilingual SWE-bench for Go repositories + defaults: + config: + type: multiswebench-go + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: go_verified + language: go + + - name: multiswebench-c + description: Multi-SWE-bench C - Multilingual SWE-bench for C repositories + defaults: + config: + type: multiswebench-c + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: c_verified + language: c + + - name: multiswebench-cpp + description: Multi-SWE-bench C++ - Multilingual SWE-bench for C++ repositories + defaults: + config: + type: multiswebench-cpp + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: cpp_verified + language: cpp + + - name: multiswebench-js + description: Multi-SWE-bench JavaScript - Multilingual SWE-bench for JavaScript repositories + defaults: + config: + type: multiswebench-js + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: js_verified + language: js + + - name: multiswebench-rust + description: Multi-SWE-bench Rust - Multilingual SWE-bench for Rust repositories + defaults: + config: + type: multiswebench-rust + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: rust_verified + language: rust + + - name: multiswebench-ts + description: Multi-SWE-bench TypeScript - Multilingual SWE-bench for TypeScript repositories + defaults: + config: + type: multiswebench-ts + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: ts_verified + language: ts + + # SWT-bench + - name: swtbench + description: SWT-bench - Software testing benchmark for test generation + defaults: + config: + type: swtbench + supported_endpoint_types: [chat] + params: + extra: + benchmark: swtbench + + # SWE-bench Multimodal + - name: swebench-multimodal + description: SWE-bench Multimodal - GitHub issues with visual context + defaults: + config: + type: swebench-multimodal + supported_endpoint_types: 
[chat] + params: + extra: + benchmark: swebenchmultimodal + dataset: princeton-nlp/SWE-bench_Multimodal + split: dev # test spit did not work + modal: false + + # OpenAgentSafety benchmark + - name: openagentsafety + description: OpenAgentSafety - Safety evaluation benchmark for AI agents + defaults: + config: + type: openagentsafety + supported_endpoint_types: [chat] + params: + extra: + benchmark: openagentsafety + dataset: mgulavani/openagentsafety_full_updated_v3 + split: train diff --git a/nemo_evaluator/openhands_benchmarks/output.py b/nemo_evaluator/openhands_benchmarks/output.py new file mode 100644 index 00000000..9104c680 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/output.py @@ -0,0 +1,54 @@ +import json +import pathlib + +from nemo_evaluator.api.api_dataclasses import EvaluationResult + + +def parse_output(output_dir: str) -> EvaluationResult: + output_path = pathlib.Path(output_dir) + + # Find any .report.json file (all benchmarks use this naming convention) + report_files = sorted(output_path.rglob("*.report.json")) + + if not report_files: + raise FileNotFoundError( + f"No .report.json file found under {output_dir}. " + "Make sure the evaluation completed successfully." + ) + + if len(report_files) > 1: + raise ValueError( + f"Multiple .report.json files found: {report_files}. " + "`output_dir` must contain a single evaluation run." + ) + + report = json.loads(report_files[0].read_text(encoding="utf-8")) + + # Get benchmark name from report + task_name = report["benchmark"] + + # All benchmarks have these common fields in their report + resolved = report.get("resolved_instances", 0) + submitted = report.get("submitted_instances", 0) + + # Calculate accuracy (handle division by zero) + accuracy = resolved / submitted if submitted > 0 else 0.0 + + metrics = { + "accuracy": { + "scores": { + "accuracy": { + "value": accuracy, + "stats": { + "resolved": resolved, + "total": submitted, + }, + } + } + } + } + + tasks = {task_name: {"metrics": metrics}} + groups = {task_name: {"metrics": metrics}} + + return EvaluationResult(tasks=tasks, groups=groups) diff --git a/pyproject.toml b/pyproject.toml index 843655a6..83bdcf79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,10 +42,13 @@ dependencies = [ "lmnr>=0.7.24", "multi-swe-bench>=1.1.1", "swt-bench @ git+https://github.com/logic-star-ai/swt-bench.git@5fdcd446ff05e248ecfffc19d560a210699f71f8", -] + "nemo_evaluator", + ] [project.scripts] validate-cfg = "benchmarks.scripts.validate_cfg:main" +generate-llm-config = "benchmarks.scripts.generate_llm_config:main" +run-benchmark = "benchmarks.scripts.run_benchmark:main" swebench-infer = "benchmarks.swebench.run_infer:main" swtbench-infer = "benchmarks.swtbench.run_infer:main" swebench-eval = "benchmarks.swebench.eval_infer:main" @@ -68,12 +71,16 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["."] -include = ["benchmarks"] +include = ["benchmarks", "benchmarks*", "nemo_evaluator", "nemo_evaluator*"] [tool.setuptools] # Install the top-level sitecustomize module so Python auto-loads our Modal logging patch. 
py-modules = ["sitecustomize"] +[tool.setuptools.package-data] +nemo_evaluator = ["**/*.yml"] +benchmarks = ["**/*.j2", "**/Dockerfile*", "**/*.json"] + [dependency-groups] dev = [ "pre-commit>=4.3.0", @@ -90,7 +97,6 @@ dev = [ [tool.ruff] target-version = "py312" line-length = 88 -exclude = ["legacy"] [tool.ruff.format] quote-style = "double" From c5f40518280f60c74761474517ddac5833390939 Mon Sep 17 00:00:00 2001 From: Ewa Dobrowolska Date: Thu, 12 Feb 2026 22:04:50 +0100 Subject: [PATCH 2/2] remove redundant VERSION file --- VERSION | 1 - 1 file changed, 1 deletion(-) delete mode 100644 VERSION diff --git a/VERSION b/VERSION deleted file mode 100644 index 6e8bf73a..00000000 --- a/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.1.0
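
The sketches below are illustrative only and are not part of the patch. The new --skip-failed-samples flag (args_parser.py, models.py, evaluation.py) decides whether a failed instance aborts the whole run via SampleFailedError or is kept as submitted-but-unresolved. A simplified, stand-alone illustration of that decision follows; the real check lives in Evaluation.run() in benchmarks/utils/evaluation.py and operates on EvalOutput objects.

    # Simplified stand-in for the fail-fast behaviour; not the Evaluation class itself.
    class SampleFailedError(Exception):
        def __init__(self, instance_id: str, error: str):
            super().__init__(f"Sample {instance_id} failed: {error}")


    def handle_result(instance_id: str, error: str | None, skip_failed_samples: bool) -> bool:
        """Return True if the sample succeeded, False if it is skipped as unresolved."""
        if error is None:
            return True
        if not skip_failed_samples:
            # Without --skip-failed-samples, the first failed sample aborts the run
            raise SampleFailedError(instance_id, error)
        # With --skip-failed-samples, the sample stays submitted but counts as not solved
        return False


    print(handle_result("astropy__astropy-1234", None, skip_failed_samples=True))   # True
    print(handle_result("astropy__astropy-5678", "build failed", skip_failed_samples=True))  # False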
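
The run_timeout added to run_conversation_with_fake_user_response() is only forwarded when the conversation is a RemoteConversation, since only the remote implementation's run() accepts a timeout. A rough sketch of that gating is below; the two classes are mock stand-ins for illustration, not the SDK's conversation types.

    # Mock stand-ins for the SDK conversation classes, used only to show the gating.
    class LocalConversation:
        def run(self):
            print("run() without timeout")


    class RemoteConversation(LocalConversation):
        def run(self, timeout: float | None = None):
            print(f"run(timeout={timeout})")


    def run_once(conversation, run_timeout: float | None) -> None:
        # Mirrors the check in benchmarks/utils/fake_user_response.py:
        # pass the timeout only to remote conversations.
        if run_timeout is not None and isinstance(conversation, RemoteConversation):
            conversation.run(timeout=run_timeout)
        else:
            conversation.run()


    run_once(LocalConversation(), run_timeout=3600.0)   # falls back to a bare run()
    run_once(RemoteConversation(), run_timeout=3600.0)  # timeout is passed through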
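
The new benchmarks/utils/llm_config.py centralizes LLM config loading: it resolves an optional api_key_env indirection from the environment and strips a trailing /chat/completions from base_url before validating the dict as an SDK LLM. A minimal sketch of the same normalization on a plain dict follows; the "model" field and the example values are assumptions for illustration, not taken from the patch.

    import json
    import os

    # Hypothetical config; "model" is an assumed field name on the SDK's LLM model,
    # while the api_key_env/base_url handling mirrors load_llm_config().
    cfg = {
        "model": "openai/my-model",
        "api_key_env": "MY_PROVIDER_API_KEY",
        "base_url": "https://inference.example.com/v1/chat/completions",
    }

    os.environ.setdefault("MY_PROVIDER_API_KEY", "dummy-key")  # normally set by the harness

    # api_key_env -> api_key, read from the environment
    cfg["api_key"] = os.environ[cfg.pop("api_key_env")]

    # Strip a trailing /chat/completions so LiteLLM can append its own route
    base_url = cfg["base_url"].rstrip("/")
    if base_url.endswith("/chat/completions"):
        base_url = base_url.removesuffix("/chat/completions")
    cfg["base_url"] = base_url

    print(json.dumps(cfg, indent=2))
    # With the benchmarks package installed, load_llm_config() performs these steps
    # and then returns LLM.model_validate(cfg).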
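
The agent-server image reference used in prepare_workspace() is now composed from two overridable pieces: OPENHANDS_EVAL_AGENT_SERVER_IMAGE (constants.py) and IMAGE_TAG_PREFIX (version.py, defaulting to the SDK short SHA). A rough sketch of the resulting tag, using a hypothetical custom_tag and build target:

    import os

    # Defaults mirror benchmarks/utils/constants.py and benchmarks/utils/version.py;
    # both environment variables are optional overrides introduced by this patch.
    EVAL_AGENT_SERVER_IMAGE = os.getenv(
        "OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server"
    )
    IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", "unknown")  # repo default is the SDK short SHA

    # Hypothetical per-instance values, for illustration only
    custom_tag = "sympy-sympy-12345"
    build_target = "source-minimal"
    suffix = f"-{build_target}" if build_target != "binary" else ""

    agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
    print(agent_server_image)
    # -> ghcr.io/openhands/eval-agent-server:unknown-sympy-sympy-12345-source-minimal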
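
nemo_evaluator/openhands_benchmarks/output.py expects exactly one *.report.json under the output directory and reads the benchmark, resolved_instances, and submitted_instances keys that the patched eval_infer scripts now write. A self-contained sketch of that contract, with made-up counts and without the nemo-evaluator dependency:

    import json
    import pathlib
    import tempfile

    # Made-up counts; the key names come from the patched eval_infer reports.
    report = {
        "benchmark": "swebench",
        "submitted_instances": 300,
        "resolved_instances": 123,
    }

    out_dir = pathlib.Path(tempfile.mkdtemp())
    (out_dir / "output.report.json").write_text(json.dumps(report, indent=4))

    # Mirror parse_output(): find the single *.report.json and derive accuracy
    report_files = sorted(out_dir.rglob("*.report.json"))
    assert len(report_files) == 1, "output_dir must contain exactly one report"
    data = json.loads(report_files[0].read_text(encoding="utf-8"))
    resolved = data.get("resolved_instances", 0)
    submitted = data.get("submitted_instances", 0)
    accuracy = resolved / submitted if submitted > 0 else 0.0
    print(f"{data['benchmark']}: accuracy={accuracy:.3f} ({resolved}/{submitted})")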