From 8fb5447717ef2e9f360cae40c030b3de67a5962e Mon Sep 17 00:00:00 2001 From: Rb Date: Wed, 18 Feb 2026 13:16:23 +0300 Subject: [PATCH 1/3] feat(swesmith): add SWE-Smith benchmark scaffold Add core inference and evaluation scripts for running OpenHands agents on SWE-Smith task instances. Co-Authored-By: Muhammed Karamuk --- benchmarks/swesmith/config.py | 15 + benchmarks/swesmith/constants.py | 28 ++ benchmarks/swesmith/eval_infer.py | 314 +++++++++++++++++++ benchmarks/swesmith/profiles.py | 111 +++++++ benchmarks/swesmith/run_infer.py | 487 ++++++++++++++++++++++++++++++ 5 files changed, 955 insertions(+) create mode 100644 benchmarks/swesmith/config.py create mode 100644 benchmarks/swesmith/constants.py create mode 100644 benchmarks/swesmith/eval_infer.py create mode 100644 benchmarks/swesmith/profiles.py create mode 100644 benchmarks/swesmith/run_infer.py diff --git a/benchmarks/swesmith/config.py b/benchmarks/swesmith/config.py new file mode 100644 index 00000000..a65bae76 --- /dev/null +++ b/benchmarks/swesmith/config.py @@ -0,0 +1,15 @@ +""" +SWE-Smith benchmark configuration. +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "SWE-bench/SWE-smith-py", + "split": "train", + "num_workers": 4, +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "workers": 4, +} diff --git a/benchmarks/swesmith/constants.py b/benchmarks/swesmith/constants.py new file mode 100644 index 00000000..9b903abb --- /dev/null +++ b/benchmarks/swesmith/constants.py @@ -0,0 +1,28 @@ +""" +SWE-Smith hyperparameters and constant values. +""" + +from typing import Final, Literal + + +# Build target type (matches openhands.agent_server.docker.build.TargetType) +TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"] +BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal" +BUILD_TARGET_BINARY: Final[TargetType] = "binary" +DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL + +# Runtime +DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 + +# Git +GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" +GIT_USER_NAME: Final[str] = "OpenHands Evaluation" +GIT_COMMIT_MESSAGE: Final[str] = "patch" + +# Patch Processing +SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = ( + "pyproject.toml", + "tox.ini", + "setup.py", +) diff --git a/benchmarks/swesmith/eval_infer.py b/benchmarks/swesmith/eval_infer.py new file mode 100644 index 00000000..f613e2d3 --- /dev/null +++ b/benchmarks/swesmith/eval_infer.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +SWE-Smith Evaluation Script + +This script converts OpenHands output.jsonl format to SWE-Smith prediction format +and runs the SWE-Smith evaluation. 
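+
+Typical invocation (paths and run id are illustrative; see the argparse epilog
+examples further down in this file):
+
+    uv run swesmith-eval output.jsonl --run-id my_eval --dataset /path/to/task_instances.json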
+ +Usage: + uv run swesmith-eval --run-id --dataset +""" + +import argparse +import json +import os +import shutil +import sys +from pathlib import Path + +from swesmith.harness.eval import main as swesmith_eval_main + +import benchmarks.swesmith.profiles # noqa: F401 — registers custom profiles +from benchmarks.swesmith import constants +from benchmarks.swesmith.config import EVAL_DEFAULTS +from benchmarks.utils.constants import MODEL_NAME_OR_PATH +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.patch_utils import remove_files_from_patch +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def convert_to_swesmith_format(input_file: str, output_file: str) -> None: + """ + Convert OpenHands output.jsonl to SWE-Smith prediction format. + + OpenHands format: + { + "instance_id": "repo__name.hash__ig_llm", + "test_result": { + "git_patch": "diff --git a/file.py b/file.py\n..." + }, + ... + } + + SWE-Smith format: + { + "instance_id": "repo__name.hash__ig_llm", + "model_patch": "diff --git a/file.py b/file.py\n...", + "model_name_or_path": "" + } + """ + logger.info(f"Converting {input_file} to SWE-Smith format: {output_file}") + + converted_count = 0 + error_count = 0 + + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + error_count += 1 + continue + + test_result = data.get("test_result", {}) + git_patch = test_result.get("git_patch", "") + + if not git_patch: + logger.warning( + f"Line {line_num}: Missing or empty git_patch for {instance_id}" + ) + git_patch = "" + + git_patch = remove_files_from_patch( + git_patch, constants.SETUP_FILES_TO_REMOVE + ) + + swesmith_entry = { + "instance_id": instance_id, + "model_patch": git_patch, + "model_name_or_path": MODEL_NAME_OR_PATH, + } + + outfile.write(json.dumps(swesmith_entry) + "\n") + converted_count += 1 + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + error_count += 1 + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + error_count += 1 + + logger.info( + f"Conversion complete: {converted_count} entries converted, " + f"{error_count} errors" + ) + + if converted_count == 0: + raise ValueError("No valid entries were converted") + + +def run_swesmith_evaluation( + predictions_file: str, + run_id: str, + dataset: str, + workers: int = EVAL_DEFAULTS["workers"], + f2p_only: bool = False, + instance_ids: list[str] | None = None, + report_only: bool = False, + redo_existing: bool = False, +) -> None: + """ + Run SWE-Smith evaluation on the predictions file. + + Calls swesmith.harness.eval directly as a Python API (not subprocess). + Custom profiles from benchmarks.swesmith.profiles are auto-registered + at import time, making them available to the swesmith harness. 
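+
+    Example call (file names and run id are illustrative):
+
+        run_swesmith_evaluation(
+            predictions_file="output.swesmith.jsonl",
+            run_id="my_eval",
+            dataset="/path/to/task_instances.json",
+            workers=4,
+        )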
+ + Args: + predictions_file: Path to the SWE-Smith format predictions file + run_id: Unique identifier for this evaluation run + dataset: Path to SWE-Smith dataset file (.json or .jsonl) + workers: Number of workers to use for evaluation + f2p_only: Run evaluation using only files with fail-to-pass tests + instance_ids: Instance IDs to evaluate (supports glob patterns) + report_only: Regenerate reports only, skip running evaluations + redo_existing: Redo already-completed evaluation instances + """ + logger.info(f"Running SWE-Smith evaluation on {predictions_file}") + + predictions_path = Path(predictions_file) + predictions_dir = predictions_path.parent + + # Resolve dataset to absolute path before changing cwd + dataset_abs = str(Path(dataset).resolve()) + + logger.info(f"Working directory: {predictions_dir}") + + # swesmith writes logs relative to cwd, so we temporarily change to + # the predictions directory (same effect as subprocess cwd=). + original_cwd = os.getcwd() + os.chdir(predictions_dir) + try: + swesmith_eval_main( + run_id=run_id, + workers=workers, + predictions_path=predictions_path.name, + dataset_path=dataset_abs, + f2p_only=f2p_only, + instance_ids=instance_ids, + report_only=report_only, + redo_existing=redo_existing, + ) + logger.info("SWE-Smith evaluation completed successfully") + except Exception as e: + logger.error(f"SWE-Smith evaluation failed: {e}") + raise + finally: + os.chdir(original_cwd) + + +def main() -> None: + """Main entry point for the script.""" + from dotenv import load_dotenv + + load_dotenv() + + parser = argparse.ArgumentParser( + description="Convert OpenHands output to SWE-Smith format and run evaluation", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run swesmith-eval output.jsonl --run-id my_eval --dataset /path/to/dataset.json + uv run swesmith-eval output.jsonl --run-id test --dataset /path/to/dataset.json --skip-evaluation + uv run swesmith-eval output.jsonl --run-id fast --dataset /path/to/dataset.json --f2p-only + uv run swesmith-eval output.jsonl --run-id filtered --dataset /path/to/dataset.json --instance-ids "repo__name.*" + """, + ) + + parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file") + + parser.add_argument( + "--dataset", + required=True, + help="Path to SWE-Smith dataset file (.json or .jsonl)", + ) + + parser.add_argument( + "--output-file", + help="Output file for SWE-Smith format " + "(default: input_file with .swesmith.jsonl extension)", + ) + + parser.add_argument( + "--skip-evaluation", + action="store_true", + help="Only convert format, skip running evaluation", + ) + + parser.add_argument( + "--workers", + type=int, + default=EVAL_DEFAULTS["workers"], + help=f"Number of workers to use when evaluating (default: {EVAL_DEFAULTS['workers']})", + ) + + parser.add_argument( + "--run-id", + required=True, + help="Unique identifier for this evaluation run", + ) + + parser.add_argument( + "--f2p-only", + action="store_true", + help="Run evaluation using only files with fail-to-pass tests (faster)", + ) + + parser.add_argument( + "--instance-ids", + nargs="+", + help="Instance IDs to evaluate (supports glob patterns like 'repo__name.*')", + ) + + parser.add_argument( + "--report-only", + action="store_true", + help="Regenerate reports only, skip running evaluations", + ) + + parser.add_argument( + "--redo-existing", + action="store_true", + help="Redo already-completed evaluation instances", + ) + + args = parser.parse_args() + + input_file = 
Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".swesmith.jsonl") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + logger.info(f"Dataset: {args.dataset}") + + dest_report_path: Path | None = None + + try: + convert_to_swesmith_format(str(input_file), str(output_file)) + + if not args.skip_evaluation: + run_swesmith_evaluation( + str(output_file), + args.run_id, + args.dataset, + args.workers, + f2p_only=args.f2p_only, + instance_ids=args.instance_ids, + report_only=args.report_only, + redo_existing=args.redo_existing, + ) + + # swesmith creates: logs/run_evaluation/{run_id}/report.json relative to cwd + report_path = ( + output_file.parent + / "logs" + / "run_evaluation" + / args.run_id + / "report.json" + ) + dest_report_path = input_file.with_suffix(".report.json") + + shutil.move(str(report_path), str(dest_report_path)) + logger.info(f"Moved report file to: {dest_report_path}") + + LaminarService.get().update_evaluation_scores( + str(input_file), str(dest_report_path) + ) + + generate_cost_report(str(input_file)) + + logger.info("Script completed successfully!") + if not args.skip_evaluation and dest_report_path is not None: + print(json.dumps({"report_json": str(dest_report_path)})) + else: + print(json.dumps({"report_json": ""})) + + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swesmith/profiles.py b/benchmarks/swesmith/profiles.py new file mode 100644 index 00000000..cea625f9 --- /dev/null +++ b/benchmarks/swesmith/profiles.py @@ -0,0 +1,111 @@ +""" +Custom repo profiles for SWE-Smith evaluation. + +Profiles defined here are auto-registered with the swesmith global registry +on import. To add a new repo, define a dataclass inheriting from the +appropriate base (GoProfile, PythonProfile, etc.) and it will be picked up +automatically. + +Usage in eval_infer.py: + import benchmarks.swesmith.profiles # noqa: F401 +""" + +from dataclasses import dataclass + +from swesmith.profiles import registry # triggers __init__.py → registers all languages +from swesmith.profiles.base import RepoProfile +from swesmith.profiles.golang import GoProfile +from swesmith.profiles.python import PythonProfile + + +# --------------------------------------------------------------------------- +# Monkey-patch: use image_name from the task instance dataset +# +# swesmith's RepoProfile.image_name is a @property that computes the Docker +# image name from profile fields. However, the computed name can differ from +# the actual image name stored in the task instance dataset (which was set at +# image build time and is the source of truth). +# +# Instead of recomputing the name, we patch the lookup to use the value +# directly from the task instance: +# +# 1. Patch registry.get_from_inst() to stash instance["image_name"] keyed +# by repo_name when the harness resolves a profile from an instance. +# 2. Patch RepoProfile.image_name to return the stashed value when available, +# falling back to the original computation otherwise. 
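+#
+# Rough flow, with illustrative field values (not taken from a real task instance):
+#
+#     inst = {"instance_id": "encode__httpx.ae1b9f66.lm_modify__abc123",
+#             "image_name": "example-registry/encode__httpx.ae1b9f66"}
+#     rp = registry.get_from_inst(inst)  # stashes inst["image_name"] keyed by rp.repo_name
+#     rp.image_name                      # -> the stashed value, not the recomputed one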
+# --------------------------------------------------------------------------- +_instance_image_names: dict[str, str] = {} + +_original_get_from_inst = registry.get_from_inst + + +def _patched_get_from_inst(instance): + rp = _original_get_from_inst(instance) + if "image_name" in instance: + _instance_image_names[rp.repo_name] = instance["image_name"] + return rp + + +registry.get_from_inst = _patched_get_from_inst + +_original_image_name_fget = RepoProfile.image_name.fget +assert _original_image_name_fget is not None +_image_name_getter = _original_image_name_fget + + +@property +def _patched_image_name(self): + override = _instance_image_names.get(self.repo_name) + if override is not None: + return override + return _image_name_getter(self) + + +RepoProfile.image_name = _patched_image_name # type: ignore[assignment] + + +# --------------------------------------------------------------------------- +# Custom profiles — add your repo profiles below. +# --------------------------------------------------------------------------- + + +@dataclass +class SecretGoProject2c88df8f(GoProfile): + owner: str = "studentkaramuk" + repo: str = "secret-go-project" + commit: str = "2c88df8f24627306470fb88dd4d89f11cee3408d" + org_gh: str = "studentkaramuk-swesmith" + + +@dataclass +class BookSummaryf26f9b51(PythonProfile): + owner: str = "reisepass" + repo: str = "book_chapter_detection_and_summarization" + commit: str = "f26f9b510449cd0bc7aacc2f504d793aed43bc96" + org_gh: str = "code-peerbench" + test_cmd: str = ( + "source /opt/miniconda3/bin/activate; " + "conda activate testbed; " + "ELEVENLABS_API_KEY=dummy " + "pytest tests/ --disable-warnings --color=no --tb=no --verbose" + ) + + +@dataclass +class Httpxae1b9f66(PythonProfile): + owner: str = "encode" + repo: str = "httpx" + commit: str = "ae1b9f66238f75ced3ced5e4485408435de10768" + org_gh: str = "studentkaramuk-swesmith" + + +# ---- Auto-register all profiles defined above ---- +_BASE_CLASSES = {RepoProfile, GoProfile, PythonProfile} + +for _name, _obj in list(globals().items()): + if ( + isinstance(_obj, type) + and issubclass(_obj, RepoProfile) + and _obj not in _BASE_CLASSES + ): + registry.register_profile(_obj) diff --git a/benchmarks/swesmith/run_infer.py b/benchmarks/swesmith/run_infer.py new file mode 100644 index 00000000..fb80b2bd --- /dev/null +++ b/benchmarks/swesmith/run_infer.py @@ -0,0 +1,487 @@ +import json +import os +from pathlib import Path +from typing import List + +from jinja2 import Environment, FileSystemLoader + +from benchmarks.swesmith import constants +from benchmarks.swesmith.build_images import ( + extract_custom_tag, + get_official_docker_image, +) +from benchmarks.swesmith.config import INFER_DEFAULTS +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image +from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from benchmarks.utils.conversation import build_event_persistence_callback +from benchmarks.utils.critics import create_critic +from benchmarks.utils.dataset import get_dataset +from benchmarks.utils.evaluation import Evaluation +from benchmarks.utils.evaluation_utils import ( + construct_eval_output_dir, + get_default_on_result_writer, +) +from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.models import ( + EvalInstance, + EvalMetadata, + EvalOutput, +) +from benchmarks.utils.version import SDK_SHORT_SHA +from openhands.sdk import LLM, 
Agent, Conversation, get_logger +from openhands.sdk.workspace import RemoteWorkspace +from openhands.tools.preset.default import get_default_tools +from openhands.workspace import APIRemoteWorkspace, DockerWorkspace + + +logger = get_logger(__name__) + +_SSH_KEY_CONTAINER_PATH = "/workspace/github_key" +_GIT_SSH_COMMAND = ( + f"ssh -i {_SSH_KEY_CONTAINER_PATH}" + " -o StrictHostKeyChecking=accept-new" + " -o IdentitiesOnly=yes" +) + +_DEFAULT_SSH_KEYS = [ + "id_rsa", + "id_ecdsa", + "id_ecdsa_sk", + "id_ed25519", + "id_ed25519_sk", + "id_xmss", +] + + +def _find_ssh_key() -> Path | None: + """Find an SSH private key: GITHUB_USER_SSH_KEY env var first, then default paths.""" + key_path = os.environ.get("GITHUB_USER_SSH_KEY") + if key_path and Path(key_path).exists(): + return Path(key_path) + + ssh_dir = Path.home() / ".ssh" + for key_name in _DEFAULT_SSH_KEYS: + key_file = ssh_dir / key_name + if key_file.exists(): + return key_file + + return None + + +def get_instruction( + instance: dict, + metadata: EvalMetadata, + workspace_path: str, +) -> str: + """Generate instruction for the agent.""" + workspace_dir_name = instance["repo"].split("/")[-1] + assert metadata.details is not None + + # Set up Jinja2 environment + assert metadata.prompt_path is not None + prompts_dir = os.path.dirname(metadata.prompt_path) + template_name = os.path.basename(metadata.prompt_path) + env = Environment(loader=FileSystemLoader(prompts_dir)) + template = env.get_template(template_name) + + # Prepare context for rendering + context = { + "instance": instance, + "workspace_dir_name": workspace_dir_name, + "actual_workspace_path": workspace_path, + "metadata": metadata, + } + context["test_instructions"] = "" + + # Render the instruction + instruction = template.render(context) + return instruction + + +class SWESmithEvaluation(Evaluation): + """ + Process-based SWE-Smith evaluation implemented as a child of the + abstract Evaluation orchestrator. + + Implements: + - prepare_instances() + - prepare_workspace(instance) + - evaluate_instance(instance, workspace) + """ + + def prepare_instances(self) -> List[EvalInstance]: + logger.info("Setting up SWE-Smith evaluation data") + + df = get_dataset( + dataset_name=self.metadata.dataset, + split=self.metadata.dataset_split, + eval_limit=self.metadata.eval_limit, + selected_instances_file=self.metadata.selected_instances_file, + ) + + instances: List[EvalInstance] = [] + for _, row in df.iterrows(): + inst_id = str(row["instance_id"]) + instances.append(EvalInstance(id=inst_id, data=row.to_dict())) + + logger.info("Total instances to process: %d", len(instances)) + return instances + + # ---- Hook: prepare a workspace per instance ---------------------------------- + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ) -> RemoteWorkspace: + """ + Use DockerWorkspace by default. + + Args: + instance: The evaluation instance to prepare workspace for. + resource_factor: Resource factor for runtime allocation (default: 1). + Higher values allocate more CPU/memory resources. + Used by APIRemoteWorkspace for remote runtime allocation. 
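+
+        Note:
+            The agent-server image tag is constructed below as
+            ``{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}``,
+            where ``suffix`` is ``-{build_target}`` for non-binary build targets
+            (e.g. ``-source-minimal``) and empty for the binary target.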
+ """ + # ADAPTATION 1: Use image_name field from dataset instead of deriving + # from instance_id (SWE-Smith stores image name directly in dataset) + official_docker_image = get_official_docker_image(instance.data["image_name"]) + build_target = constants.DEFAULT_BUILD_TARGET + custom_tag = extract_custom_tag(official_docker_image) + # For non-binary targets, append target suffix + suffix = ( + f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" + ) + base_agent_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + ) + agent_server_image = base_agent_image + + # Forward all OPENHANDS_* env vars into the container with prefix stripped. + # e.g. OPENHANDS_ANTHROPIC_API_KEY becomes ANTHROPIC_API_KEY inside the container. + OPENHANDS_ENV_PREFIX = "OPENHANDS_" + forwarded_env_names = [] + for key, value in os.environ.items(): + if key.startswith(OPENHANDS_ENV_PREFIX): + stripped = key[len(OPENHANDS_ENV_PREFIX) :] + os.environ[stripped] = value + forwarded_env_names.append(stripped) + all_forward_env = list(forward_env or []) + forwarded_env_names + + volumes = [] + + # Forward GIT_SSH_COMMAND for private repo git fetch. + # The actual key is injected in evaluate_instance() via base64 to avoid + # Docker bind-mount permission issues. + ssh_key_path = _find_ssh_key() + if ssh_key_path: + all_forward_env.append("GIT_SSH_COMMAND") + os.environ["GIT_SSH_COMMAND"] = _GIT_SSH_COMMAND + logger.info(f"Found SSH key {ssh_key_path} for private repo access") + + if self.metadata.workspace_type == "docker": + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swesmith/build_images.py and set " + "SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." + ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if base_agent_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{base_agent_image}" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=all_forward_env, + volumes=volumes, + ) + elif self.metadata.workspace_type == "remote": + runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + if not runtime_api_key: + raise ValueError( + "RUNTIME_API_KEY environment variable is not set for remote workspace" + ) + + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"Agent server image {agent_server_image} does not exist in container registry, " + "make sure to build, push it, and make it public accessible before using remote workspace." 
+ ) + logger.info( + f"Using remote workspace with image {agent_server_image} " + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + ) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT), + ) + ) + workspace = APIRemoteWorkspace( + runtime_api_url=os.getenv( + "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL + ), + runtime_api_key=runtime_api_key, + server_image=agent_server_image, + target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], + resource_factor=resource_factor, + init_timeout=startup_timeout, + startup_wait_timeout=startup_timeout, + ) + else: + raise ValueError( + f"Unsupported workspace_type: {self.metadata.workspace_type}" + ) + + for cmd in self.metadata.env_setup_commands or []: + res = workspace.execute_command(cmd) + if res.exit_code != 0: + raise RuntimeError( + f"Failed to run env setup command '{cmd}': {res.stderr}" + ) + logger.debug(f"Ran env setup command '{cmd}': {res.stdout}") + return workspace + + # ---- Hook: evaluate one instance --------------------------------------------- + def evaluate_instance( + self, instance: EvalInstance, workspace: RemoteWorkspace + ) -> EvalOutput: + """ + Create conversation, run agent, collect history and git patch. + Do not write files here; just return EvalOutput. + """ + tools = get_default_tools( + # Disable browser tools in CLI mode + enable_browser=False, + ) + agent = Agent( + llm=self.metadata.llm, + tools=tools, + system_prompt_kwargs={"cli_mode": True}, + ) + + assert isinstance(workspace, RemoteWorkspace) + + repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/" + instance.data["repo_path"] = repo_path + + persist_callback = build_event_persistence_callback( + run_id=self.metadata.eval_output_dir, + instance_id=instance.id, + attempt=self.current_attempt, + ) + + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[persist_callback], + max_iteration_per_run=self.metadata.max_iterations, + delete_on_close=True, + ) + + logger.info("repo_path: %s", repo_path) + cp_testebed_repo = workspace.execute_command( + (f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}") + ) + assert cp_testebed_repo.exit_code == 0, ( + f"cp_testebed_repo failed: {cp_testebed_repo.stderr}" + ) + + # git reset + git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard") + assert git_reset.exit_code == 0, f"git reset failed: {git_reset.stderr}" + + # Inject SSH key into container for private repo git fetch. + # We base64-encode and decode to avoid shell escaping issues and + # Docker bind-mount permission problems. 
+ ssh_key = _find_ssh_key() + if ssh_key: + import base64 + + key_b64 = base64.b64encode(ssh_key.read_bytes()).decode() + setup_ssh = workspace.execute_command( + f"echo '{key_b64}' | base64 -d > {_SSH_KEY_CONTAINER_PATH}" + f" && chmod 600 {_SSH_KEY_CONTAINER_PATH}" + ) + assert setup_ssh.exit_code == 0, f"SSH key setup failed: {setup_ssh.stderr}" + + # Fetch bug branch from GitHub mirror and checkout + # Use SSH URL for private repos when an SSH key is available + if ssh_key: + mirror_url = f"git@github.com:{instance.data['repo']}.git" + else: + mirror_url = f"https://github.com/{instance.data['repo']}.git" + git_fetch = workspace.execute_command( + f"cd {repo_path} ; git fetch {mirror_url} {instance.id}" + ) + assert git_fetch.exit_code == 0, f"git fetch failed: {git_fetch.stderr}" + git_checkout = workspace.execute_command( + f"cd {repo_path} ; git checkout FETCH_HEAD" + ) + assert git_checkout.exit_code == 0, ( + f"git checkout failed: {git_checkout.stderr}" + ) + + # Remove untracked files (respects .gitignore, so installed deps are preserved) + workspace.execute_command(f"cd {repo_path} ; git clean -fdq") + + # Capture HEAD after checkout so base_commit reflects the bug branch + head_result = workspace.execute_command(f"cd {repo_path} ; git rev-parse HEAD") + assert head_result.exit_code == 0, ( + f"git rev-parse HEAD failed: {head_result.stderr}" + ) + base_commit = head_result.stdout.strip() + instance.data["base_commit"] = base_commit + + instruction = get_instruction( + instance=instance.data, + metadata=self.metadata, + workspace_path=workspace.working_dir, + ) + conversation.send_message(instruction) + # Run conversation with fake user responses to handle agent messages + run_conversation_with_fake_user_response(conversation) + + # git add + workspace.execute_command(f"cd {repo_path} ; git add -A") + + # git commit + # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail + workspace.execute_command( + f"cd {repo_path} && " + f"git config --global user.email '{constants.GIT_USER_EMAIL}' && " + f"git config --global user.name '{constants.GIT_USER_NAME}' && " + f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'" + ) + + # Get git patch + git_patch_result = workspace.execute_command( + (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD") + ) + assert git_patch_result.exit_code == 0, ( + f"git diff failed: {git_patch_result.stderr}" + ) + git_patch = git_patch_result.stdout + + # EvalOutput is your model; keep fields consistent with prior JSONL + out = EvalOutput( + instance_id=instance.id, + attempt=self.current_attempt, + test_result={ + "git_patch": git_patch, + }, + instruction=instruction, + error=None, + history=list(conversation.state.events), + metrics=conversation.conversation_stats.get_combined_metrics(), + ) + return out + + +def main() -> None: + from dotenv import load_dotenv + + load_dotenv() + + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + + parser = get_parser() + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) + parser.set_defaults(**INFER_DEFAULTS) + args = parser.parse_args() + + # Validate max_attempts + if args.max_attempts < 1: + raise ValueError(f"max_attempts must be >= 1, 
got {args.max_attempts}") + + llm_config_path = args.llm_config_path + if not os.path.isfile(llm_config_path): + raise ValueError(f"LLM config file {llm_config_path} does not exist") + with open(llm_config_path, "r") as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) + + dataset_description = ( + args.dataset.replace("/", "__") + "-" + args.split.replace("/", "__") + ) + + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=args.max_iterations, + eval_note=args.note, + ) + + # Create critic instance from parsed arguments + critic = create_critic(args) + logger.info(f"Using critic: {type(critic).__name__}") + + metadata = EvalMetadata( + llm=llm, + dataset=args.dataset, + dataset_split=args.split, + max_iterations=args.max_iterations, + eval_output_dir=structured_output_dir, + details={}, + prompt_path=args.prompt_path, + eval_limit=args.n_limit, + env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"], + max_attempts=args.max_attempts, + critic=critic, + selected_instances_file=args.select, + max_retries=args.max_retries, + workspace_type=args.workspace, + ) + + # Run orchestrator with a simple JSONL writer + evaluator = SWESmithEvaluation( + metadata=metadata, + num_workers=args.num_workers, + ) + + evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path)) + + logger.info("Evaluation completed!") + # Emit machine-readable path for callers + print(json.dumps({"output_json": str(evaluator.output_path)})) + + +if __name__ == "__main__": + main() From 3fc2bd0efdfd0b0934613b91b4255ebe55645235 Mon Sep 17 00:00:00 2001 From: Rb Date: Wed, 18 Feb 2026 13:16:31 +0300 Subject: [PATCH 2/3] docs(swesmith): add prompt template, README, and env example Co-Authored-By: Muhammed Karamuk --- benchmarks/swesmith/.env.example | 6 + benchmarks/swesmith/README.md | 192 +++++++++++++++++++++++-- benchmarks/swesmith/prompts/default.j2 | 61 ++++++++ 3 files changed, 250 insertions(+), 9 deletions(-) create mode 100644 benchmarks/swesmith/.env.example create mode 100644 benchmarks/swesmith/prompts/default.j2 diff --git a/benchmarks/swesmith/.env.example b/benchmarks/swesmith/.env.example new file mode 100644 index 00000000..ad7bc859 --- /dev/null +++ b/benchmarks/swesmith/.env.example @@ -0,0 +1,6 @@ +# SSH key for private repo access (optional, only needed for non-standard key paths) +# If not set, default keys in ~/.ssh/ are used automatically. +# GITHUB_USER_SSH_KEY=/home/user/.ssh/id_ed25519_github + +# GitHub token (optional, increases GitHub API rate limit from 60 to 5000 req/hour) +# GITHUB_TOKEN= diff --git a/benchmarks/swesmith/README.md b/benchmarks/swesmith/README.md index 7e69768f..dbb16646 100644 --- a/benchmarks/swesmith/README.md +++ b/benchmarks/swesmith/README.md @@ -1,19 +1,23 @@ -# SWE-Smith Benchmark - building Docker images +# SWE-Smith Benchmark Evaluation -This directory contains implementation for building custom agent server Docker images for SWE-Smith. The primary purpose is to use GitHub workflows for building these images fast and using them to train LLMs as SWE agents. +This directory contains the implementation for running SWE-Smith evaluation using OpenHands agents. + +## Overview + +SWE-Smith is a benchmark for training and evaluating AI agents on synthetically generated software engineering tasks. 
Task instances are created by injecting bugs into real repositories and validating them against test suites. ## Dataset -- **Source**: [Paper](https://arxiv.org/abs/2504.21798) -- **Dataset**: - - `SWE-bench/SWE-smith-py` - Full dataset +- **Source**: [SWE-Smith Paper](https://arxiv.org/abs/2504.21798) +- **Dataset**: `SWE-bench/SWE-smith-py` - **Splits**: `train` +- Local task instance files (`.json` / `.jsonl`) generated via SWE-Smith are also supported. ## Usage -### Build Docker Images +### Step 1: Build Docker Images -You need to build Docker images for the SWE-Smith instances. Each instance requires a specific environment setup based on the repository and issue. **Note that this will consume atleast 150-200GB of disk space. Considering setting `--n-limit` to a smaller value if required.** +Before running inference, you need to build Docker images for the SWE-Smith instances. Each instance requires a specific environment setup. Disk usage depends on the number and size of task instances — the full dataset can consume 150-200GB, but smaller local instance files will use proportionally less. ```bash uv run python -m benchmarks.swesmith.build_images \ @@ -23,7 +27,177 @@ uv run python -m benchmarks.swesmith.build_images \ --target source-minimal ``` -### Running rollouts +For local task instance files: + +```bash +uv run python -m benchmarks.swesmith.build_images \ + --dataset /path/to/task_instances.json \ + --split train \ + --image ghcr.io/openhands/eval-agent-server \ + --target source-minimal \ + --n-limit 10 +``` + +### Step 2: Run Inference + +```bash +uv run swesmith-infer path/to/llm_config.json \ + --dataset /path/to/task_instances.json \ + --workspace docker \ + --max-iterations 75 \ + --num-workers 4 +``` + +**Selecting specific instances:** + +```bash +# Create instances.txt with one instance ID per line +echo "encode__httpx.ae1b9f66.lm_modify__abc123" > instances.txt + +uv run swesmith-infer path/to/llm_config.json \ + --dataset /path/to/task_instances.json \ + --select instances.txt \ + --workspace docker +``` + +### Configuration Options + +| Argument | Description | Default | +|----------|-------------|---------| +| `--dataset` | HuggingFace dataset name or local file path | `SWE-bench/SWE-smith-py` | +| `--split` | Dataset split | `train` | +| `--workspace` | Workspace type | `docker` | +| `--num-workers` | Parallel workers | `4` | +| `--max-iterations` | Max agent turns per instance | `500` | +| `--n-limit` | Limit number of instances | all | +| `--select` | Text file with instance IDs (one per line) | - | +| `--max-attempts` | Retry attempts with critic | `3` | +| `--critic` | `pass` / `finish_with_patch` / `empty_patch_critic` | `finish_with_patch` | +| `--prompt-path` | Jinja2 prompt template | `prompts/default.j2` | +| `--note` | Note appended to output directory name | - | + +### Private Repositories + +For private repos, an SSH key must be accessible. The lookup order is: + +1. `GITHUB_USER_SSH_KEY` environment variable (path to key file) +2. `~/.ssh/id_rsa`, `id_ecdsa`, `id_ecdsa_sk`, `id_ed25519`, `id_ed25519_sk` (first match) + +```bash +# Only needed if your key has a non-standard name +export GITHUB_USER_SSH_KEY=~/.ssh/my_custom_key +``` + +### Environment Variables + +Environment variables can be set directly or via a `.env` file in the project root. + +All environment variables prefixed with `OPENHANDS_` are forwarded into the Docker container with the prefix stripped. 
For example, `OPENHANDS_ANTHROPIC_API_KEY` becomes `ANTHROPIC_API_KEY` inside the container. This is how you pass LLM API keys and other credentials to the agent. + +```bash +export OPENHANDS_ANTHROPIC_API_KEY=sk-xxx +export OPENHANDS_OPENAI_API_KEY=sk-xxx +export OPENHANDS_GOOGLE_APPLICATION_CREDENTIALS='{"type":"service_account",...}' +``` + +| Variable | Description | +|----------|-------------| +| `OPENHANDS_*` | Forwarded into the container with prefix stripped (LLM keys, credentials, etc.) | +| `GITHUB_USER_SSH_KEY` | Path to SSH key for private repos | +| `SKIP_BUILD` | Set to `1` to skip Docker image building during inference (default: `1`) | + +## Evaluation + +After running inference, evaluate the generated patches: + +```bash +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json +``` + +**Advanced options:** + +```bash +# Faster evaluation using only fail-to-pass tests +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --f2p-only + +# Re-evaluate failed/errored instances +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --redo-existing + +# Only regenerate the report from existing evaluation logs +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --report-only +``` + +## Output Structure + +``` +eval_outputs/ +└── -/ + └── / + ├── output.jsonl # Main results + ├── output.critic_attempt_N.jsonl # Per-attempt results + ├── output.swesmith.jsonl # SWE-Smith format predictions + ├── output.report.json # Evaluation report (SWE-Smith format) + ├── cost_report.jsonl # Token usage and cost + └── conversations/ # Per-instance conversation logs + └── .tar.gz +``` + +**Inference result** (`output.jsonl`, one entry per line): + +```json +{ + "instance_id": "encode__httpx.ae1b9f66.lm_modify__abc123", + "attempt": 1, + "test_result": { + "git_patch": "diff --git a/file.py b/file.py\n..." + }, + "instruction": "...", + "history": [], + "metrics": {}, + "error": null +} +``` + +**Evaluation report** (`output.report.json`) follows the SWE-Smith report format: + +```json +{ + "resolved": 5, + "unresolved": 3, + "total": 8, + "ids_resolved": ["instance_1", "..."], + "ids_unresolved": ["instance_3", "..."] +} +``` + +## Custom Repository Profiles + +To add a custom repository, define a profile class in `profiles.py`: + +```python +@dataclass +class MyRepoBcd12345(PythonProfile): + owner: str = "github-org" + repo: str = "my-repo" + commit: str = "bcd1234567890" + org_gh: str = "org-swesmith" +``` + +Profiles are auto-registered on import. For Go repositories, inherit from `GoProfile` instead. -This is not supported yet for SWE-Smith because the primary purpose of this directory is fast and smooth creation of Docker images. +## References +- [SWE-Smith Paper](https://arxiv.org/abs/2504.21798) +- [SWE-Smith GitHub](https://github.com/SWE-bench/SWE-smith) +- [SWE-Smith Dataset on HuggingFace](https://huggingface.co/datasets/SWE-bench/SWE-smith) diff --git a/benchmarks/swesmith/prompts/default.j2 b/benchmarks/swesmith/prompts/default.j2 new file mode 100644 index 00000000..6adfc035 --- /dev/null +++ b/benchmarks/swesmith/prompts/default.j2 @@ -0,0 +1,61 @@ +I have access to a code repository in the directory {{ instance.repo_path }} . You can explore and modify files using the available tools. 
Consider the following issue description: + + +{{ instance.problem_statement }} + + +Can you help me implement the necessary changes to the repository so that the requirements specified in the are met? +I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! +Also the development environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages. +Your task is to make the minimal changes to non-test files in the {{ instance.repo_path }} directory to ensure the is satisfied. + +Follow these phases to resolve the issue: + +Phase 1. READING: read the problem and reword it in clearer terms + 1.1 If there are code or config snippets. Express in words any best practices or conventions in them. + 1.2 Hightlight message errors, method names, variables, file names, stack traces, and technical details. + 1.3 Explain the problem in clear terms. + 1.4 Enumerate the steps to reproduce the problem. + 1.5 Hightlight any best practices to take into account when testing and fixing the issue + +Phase 2. RUNNING: set up and run the tests on the repository + 2.1 Explore the repository structure and any build/test configuration files to understand how to run the project. + 2.2 Activate any required environment (e.g., virtualenv, conda, nvm) if applicable. + 2.3 Iterate and figure out how to run the tests. + +Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions + 3.1 Use `grep` to search for relevant methods, classes, keywords and error messages. + 3.2 Identify all files related to the problem statement. + 3.3 Propose the methods and files to fix the issue and explain why. + 3.4 From the possible file locations, select the most likely location to fix the issue. + +Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue. + 4.1 Look at existing test files in the repository to understand the test format/structure. + 4.2 Create a minimal reproduction script that reproduces the located issue. + 4.3 Run the reproduction script to confirm you are reproducing the issue. + 4.4 Adjust the reproduction script as necessary. + +Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it + 5.1 State clearly what the problem is. + 5.2 State clearly where the problem is located. + 5.3 State clearly how the test reproduces the issue. + 5.4 State clearly the best practices to take into account in the fix. + 5.5 State clearly how to fix the problem. + +Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution. + 6.1 Make minimal, focused changes to fix the issue. + +Phase 7. VERIFICATION: Test your implementation thoroughly. + 7.1 Run your reproduction script to verify the fix works. + 7.2 Add edge cases to your test script to ensure comprehensive coverage. + 7.3 Run existing tests related to the modified code to ensure you haven't broken anything. + +8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {{ instance.base_commit }}. + 8.1 Ensure you've fully addressed all requirements. + 8.2 Run any tests in the repository related to: + 8.2.1 The issue you are fixing + 8.2.2 The files you modified + 8.2.3 The functions you changed + 8.3 If any tests fail, revise your implementation until all tests pass + +Be thorough in your exploration, testing, and reasoning. 
It's fine if your thinking process is lengthy - quality and completeness are more important than brevity. From 8e2a05d8fa5c5c508b7dc192e25749983405db88 Mon Sep 17 00:00:00 2001 From: Muhammed Karamuk Date: Wed, 18 Feb 2026 13:16:37 +0300 Subject: [PATCH 3/3] fix(utils): support .json files in dataset loader Co-Authored-By: Rb --- benchmarks/utils/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/dataset.py b/benchmarks/utils/dataset.py index a60356ca..3eeab8d0 100644 --- a/benchmarks/utils/dataset.py +++ b/benchmarks/utils/dataset.py @@ -114,7 +114,9 @@ def get_dataset( ) -> pd.DataFrame: """Load and prepare dataset for evaluation.""" # Check if dataset_name is a local file path - if os.path.isfile(dataset_name) and dataset_name.endswith(".jsonl"): + if os.path.isfile(dataset_name) and ( + dataset_name.endswith(".jsonl") or dataset_name.endswith(".json") + ): # Load local JSONL file dataset = load_dataset("json", data_files=dataset_name, split="train") assert isinstance(dataset, Dataset)