From 8fb5447717ef2e9f360cae40c030b3de67a5962e Mon Sep 17 00:00:00 2001 From: Rb Date: Wed, 18 Feb 2026 13:16:23 +0300 Subject: [PATCH 1/3] feat(swesmith): add SWE-Smith benchmark scaffold Add core inference and evaluation scripts for running OpenHands agents on SWE-Smith task instances. Co-Authored-By: Muhammed Karamuk --- benchmarks/swesmith/config.py | 15 + benchmarks/swesmith/constants.py | 28 ++ benchmarks/swesmith/eval_infer.py | 314 +++++++++++++++++++ benchmarks/swesmith/profiles.py | 111 +++++++ benchmarks/swesmith/run_infer.py | 487 ++++++++++++++++++++++++++++++ 5 files changed, 955 insertions(+) create mode 100644 benchmarks/swesmith/config.py create mode 100644 benchmarks/swesmith/constants.py create mode 100644 benchmarks/swesmith/eval_infer.py create mode 100644 benchmarks/swesmith/profiles.py create mode 100644 benchmarks/swesmith/run_infer.py diff --git a/benchmarks/swesmith/config.py b/benchmarks/swesmith/config.py new file mode 100644 index 00000000..a65bae76 --- /dev/null +++ b/benchmarks/swesmith/config.py @@ -0,0 +1,15 @@ +""" +SWE-Smith benchmark configuration. +""" + +# Inference defaults (used by run_infer.py) +INFER_DEFAULTS = { + "dataset": "SWE-bench/SWE-smith-py", + "split": "train", + "num_workers": 4, +} + +# Evaluation defaults (used by eval_infer.py) +EVAL_DEFAULTS = { + "workers": 4, +} diff --git a/benchmarks/swesmith/constants.py b/benchmarks/swesmith/constants.py new file mode 100644 index 00000000..9b903abb --- /dev/null +++ b/benchmarks/swesmith/constants.py @@ -0,0 +1,28 @@ +""" +SWE-Smith hyperparameters and constant values. +""" + +from typing import Final, Literal + + +# Build target type (matches openhands.agent_server.docker.build.TargetType) +TargetType = Literal["binary", "binary-minimal", "source", "source-minimal"] +BUILD_TARGET_SOURCE_MINIMAL: Final[TargetType] = "source-minimal" +BUILD_TARGET_BINARY: Final[TargetType] = "binary" +DEFAULT_BUILD_TARGET: Final[TargetType] = BUILD_TARGET_SOURCE_MINIMAL + +# Runtime +DEFAULT_RUNTIME_API_URL: Final[str] = "https://runtime.eval.all-hands.dev" +DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT: Final[int] = 600 + +# Git +GIT_USER_EMAIL: Final[str] = "evaluation@openhands.dev" +GIT_USER_NAME: Final[str] = "OpenHands Evaluation" +GIT_COMMIT_MESSAGE: Final[str] = "patch" + +# Patch Processing +SETUP_FILES_TO_REMOVE: Final[tuple[str, ...]] = ( + "pyproject.toml", + "tox.ini", + "setup.py", +) diff --git a/benchmarks/swesmith/eval_infer.py b/benchmarks/swesmith/eval_infer.py new file mode 100644 index 00000000..f613e2d3 --- /dev/null +++ b/benchmarks/swesmith/eval_infer.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +SWE-Smith Evaluation Script + +This script converts OpenHands output.jsonl format to SWE-Smith prediction format +and runs the SWE-Smith evaluation. 
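+
+Typical invocation (paths and run id are illustrative; see the argparse epilog
+examples further down in this file):
+
+    uv run swesmith-eval output.jsonl --run-id my_eval --dataset /path/to/task_instances.json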
+ +Usage: + uv run swesmith-eval --run-id --dataset +""" + +import argparse +import json +import os +import shutil +import sys +from pathlib import Path + +from swesmith.harness.eval import main as swesmith_eval_main + +import benchmarks.swesmith.profiles # noqa: F401 — registers custom profiles +from benchmarks.swesmith import constants +from benchmarks.swesmith.config import EVAL_DEFAULTS +from benchmarks.utils.constants import MODEL_NAME_OR_PATH +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.patch_utils import remove_files_from_patch +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def convert_to_swesmith_format(input_file: str, output_file: str) -> None: + """ + Convert OpenHands output.jsonl to SWE-Smith prediction format. + + OpenHands format: + { + "instance_id": "repo__name.hash__ig_llm", + "test_result": { + "git_patch": "diff --git a/file.py b/file.py\n..." + }, + ... + } + + SWE-Smith format: + { + "instance_id": "repo__name.hash__ig_llm", + "model_patch": "diff --git a/file.py b/file.py\n...", + "model_name_or_path": "" + } + """ + logger.info(f"Converting {input_file} to SWE-Smith format: {output_file}") + + converted_count = 0 + error_count = 0 + + with open(input_file, "r") as infile, open(output_file, "w") as outfile: + for line_num, line in enumerate(infile, 1): + try: + line = line.strip() + if not line: + continue + + data = json.loads(line) + + instance_id = data.get("instance_id") + if not instance_id: + logger.warning(f"Line {line_num}: Missing instance_id") + error_count += 1 + continue + + test_result = data.get("test_result", {}) + git_patch = test_result.get("git_patch", "") + + if not git_patch: + logger.warning( + f"Line {line_num}: Missing or empty git_patch for {instance_id}" + ) + git_patch = "" + + git_patch = remove_files_from_patch( + git_patch, constants.SETUP_FILES_TO_REMOVE + ) + + swesmith_entry = { + "instance_id": instance_id, + "model_patch": git_patch, + "model_name_or_path": MODEL_NAME_OR_PATH, + } + + outfile.write(json.dumps(swesmith_entry) + "\n") + converted_count += 1 + + except json.JSONDecodeError as e: + logger.error(f"Line {line_num}: Invalid JSON - {e}") + error_count += 1 + except Exception as e: + logger.error(f"Line {line_num}: Unexpected error - {e}") + error_count += 1 + + logger.info( + f"Conversion complete: {converted_count} entries converted, " + f"{error_count} errors" + ) + + if converted_count == 0: + raise ValueError("No valid entries were converted") + + +def run_swesmith_evaluation( + predictions_file: str, + run_id: str, + dataset: str, + workers: int = EVAL_DEFAULTS["workers"], + f2p_only: bool = False, + instance_ids: list[str] | None = None, + report_only: bool = False, + redo_existing: bool = False, +) -> None: + """ + Run SWE-Smith evaluation on the predictions file. + + Calls swesmith.harness.eval directly as a Python API (not subprocess). + Custom profiles from benchmarks.swesmith.profiles are auto-registered + at import time, making them available to the swesmith harness. 
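+
+    Example call (file names and run id are illustrative):
+
+        run_swesmith_evaluation(
+            predictions_file="output.swesmith.jsonl",
+            run_id="my_eval",
+            dataset="/path/to/task_instances.json",
+            workers=4,
+        )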
+ + Args: + predictions_file: Path to the SWE-Smith format predictions file + run_id: Unique identifier for this evaluation run + dataset: Path to SWE-Smith dataset file (.json or .jsonl) + workers: Number of workers to use for evaluation + f2p_only: Run evaluation using only files with fail-to-pass tests + instance_ids: Instance IDs to evaluate (supports glob patterns) + report_only: Regenerate reports only, skip running evaluations + redo_existing: Redo already-completed evaluation instances + """ + logger.info(f"Running SWE-Smith evaluation on {predictions_file}") + + predictions_path = Path(predictions_file) + predictions_dir = predictions_path.parent + + # Resolve dataset to absolute path before changing cwd + dataset_abs = str(Path(dataset).resolve()) + + logger.info(f"Working directory: {predictions_dir}") + + # swesmith writes logs relative to cwd, so we temporarily change to + # the predictions directory (same effect as subprocess cwd=). + original_cwd = os.getcwd() + os.chdir(predictions_dir) + try: + swesmith_eval_main( + run_id=run_id, + workers=workers, + predictions_path=predictions_path.name, + dataset_path=dataset_abs, + f2p_only=f2p_only, + instance_ids=instance_ids, + report_only=report_only, + redo_existing=redo_existing, + ) + logger.info("SWE-Smith evaluation completed successfully") + except Exception as e: + logger.error(f"SWE-Smith evaluation failed: {e}") + raise + finally: + os.chdir(original_cwd) + + +def main() -> None: + """Main entry point for the script.""" + from dotenv import load_dotenv + + load_dotenv() + + parser = argparse.ArgumentParser( + description="Convert OpenHands output to SWE-Smith format and run evaluation", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + uv run swesmith-eval output.jsonl --run-id my_eval --dataset /path/to/dataset.json + uv run swesmith-eval output.jsonl --run-id test --dataset /path/to/dataset.json --skip-evaluation + uv run swesmith-eval output.jsonl --run-id fast --dataset /path/to/dataset.json --f2p-only + uv run swesmith-eval output.jsonl --run-id filtered --dataset /path/to/dataset.json --instance-ids "repo__name.*" + """, + ) + + parser.add_argument("input_file", help="Path to the OpenHands output.jsonl file") + + parser.add_argument( + "--dataset", + required=True, + help="Path to SWE-Smith dataset file (.json or .jsonl)", + ) + + parser.add_argument( + "--output-file", + help="Output file for SWE-Smith format " + "(default: input_file with .swesmith.jsonl extension)", + ) + + parser.add_argument( + "--skip-evaluation", + action="store_true", + help="Only convert format, skip running evaluation", + ) + + parser.add_argument( + "--workers", + type=int, + default=EVAL_DEFAULTS["workers"], + help=f"Number of workers to use when evaluating (default: {EVAL_DEFAULTS['workers']})", + ) + + parser.add_argument( + "--run-id", + required=True, + help="Unique identifier for this evaluation run", + ) + + parser.add_argument( + "--f2p-only", + action="store_true", + help="Run evaluation using only files with fail-to-pass tests (faster)", + ) + + parser.add_argument( + "--instance-ids", + nargs="+", + help="Instance IDs to evaluate (supports glob patterns like 'repo__name.*')", + ) + + parser.add_argument( + "--report-only", + action="store_true", + help="Regenerate reports only, skip running evaluations", + ) + + parser.add_argument( + "--redo-existing", + action="store_true", + help="Redo already-completed evaluation instances", + ) + + args = parser.parse_args() + + input_file = 
Path(args.input_file) + if not input_file.exists(): + logger.error(f"Input file does not exist: {input_file}") + sys.exit(1) + + if not input_file.suffix == ".jsonl": + logger.warning(f"Input file does not have .jsonl extension: {input_file}") + + if args.output_file: + output_file = Path(args.output_file) + else: + output_file = input_file.with_suffix(".swesmith.jsonl") + + logger.info(f"Input file: {input_file}") + logger.info(f"Output file: {output_file}") + logger.info(f"Dataset: {args.dataset}") + + dest_report_path: Path | None = None + + try: + convert_to_swesmith_format(str(input_file), str(output_file)) + + if not args.skip_evaluation: + run_swesmith_evaluation( + str(output_file), + args.run_id, + args.dataset, + args.workers, + f2p_only=args.f2p_only, + instance_ids=args.instance_ids, + report_only=args.report_only, + redo_existing=args.redo_existing, + ) + + # swesmith creates: logs/run_evaluation/{run_id}/report.json relative to cwd + report_path = ( + output_file.parent + / "logs" + / "run_evaluation" + / args.run_id + / "report.json" + ) + dest_report_path = input_file.with_suffix(".report.json") + + shutil.move(str(report_path), str(dest_report_path)) + logger.info(f"Moved report file to: {dest_report_path}") + + LaminarService.get().update_evaluation_scores( + str(input_file), str(dest_report_path) + ) + + generate_cost_report(str(input_file)) + + logger.info("Script completed successfully!") + if not args.skip_evaluation and dest_report_path is not None: + print(json.dumps({"report_json": str(dest_report_path)})) + else: + print(json.dumps({"report_json": ""})) + + except Exception as e: + logger.error(f"Script failed: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swesmith/profiles.py b/benchmarks/swesmith/profiles.py new file mode 100644 index 00000000..cea625f9 --- /dev/null +++ b/benchmarks/swesmith/profiles.py @@ -0,0 +1,111 @@ +""" +Custom repo profiles for SWE-Smith evaluation. + +Profiles defined here are auto-registered with the swesmith global registry +on import. To add a new repo, define a dataclass inheriting from the +appropriate base (GoProfile, PythonProfile, etc.) and it will be picked up +automatically. + +Usage in eval_infer.py: + import benchmarks.swesmith.profiles # noqa: F401 +""" + +from dataclasses import dataclass + +from swesmith.profiles import registry # triggers __init__.py → registers all languages +from swesmith.profiles.base import RepoProfile +from swesmith.profiles.golang import GoProfile +from swesmith.profiles.python import PythonProfile + + +# --------------------------------------------------------------------------- +# Monkey-patch: use image_name from the task instance dataset +# +# swesmith's RepoProfile.image_name is a @property that computes the Docker +# image name from profile fields. However, the computed name can differ from +# the actual image name stored in the task instance dataset (which was set at +# image build time and is the source of truth). +# +# Instead of recomputing the name, we patch the lookup to use the value +# directly from the task instance: +# +# 1. Patch registry.get_from_inst() to stash instance["image_name"] keyed +# by repo_name when the harness resolves a profile from an instance. +# 2. Patch RepoProfile.image_name to return the stashed value when available, +# falling back to the original computation otherwise. 
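+#
+# Rough flow, with illustrative field values (not taken from a real task instance):
+#
+#     inst = {"instance_id": "encode__httpx.ae1b9f66.lm_modify__abc123",
+#             "image_name": "example-registry/encode__httpx.ae1b9f66"}
+#     rp = registry.get_from_inst(inst)  # stashes inst["image_name"] keyed by rp.repo_name
+#     rp.image_name                      # -> the stashed value, not the recomputed one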
+# --------------------------------------------------------------------------- +_instance_image_names: dict[str, str] = {} + +_original_get_from_inst = registry.get_from_inst + + +def _patched_get_from_inst(instance): + rp = _original_get_from_inst(instance) + if "image_name" in instance: + _instance_image_names[rp.repo_name] = instance["image_name"] + return rp + + +registry.get_from_inst = _patched_get_from_inst + +_original_image_name_fget = RepoProfile.image_name.fget +assert _original_image_name_fget is not None +_image_name_getter = _original_image_name_fget + + +@property +def _patched_image_name(self): + override = _instance_image_names.get(self.repo_name) + if override is not None: + return override + return _image_name_getter(self) + + +RepoProfile.image_name = _patched_image_name # type: ignore[assignment] + + +# --------------------------------------------------------------------------- +# Custom profiles — add your repo profiles below. +# --------------------------------------------------------------------------- + + +@dataclass +class SecretGoProject2c88df8f(GoProfile): + owner: str = "studentkaramuk" + repo: str = "secret-go-project" + commit: str = "2c88df8f24627306470fb88dd4d89f11cee3408d" + org_gh: str = "studentkaramuk-swesmith" + + +@dataclass +class BookSummaryf26f9b51(PythonProfile): + owner: str = "reisepass" + repo: str = "book_chapter_detection_and_summarization" + commit: str = "f26f9b510449cd0bc7aacc2f504d793aed43bc96" + org_gh: str = "code-peerbench" + test_cmd: str = ( + "source /opt/miniconda3/bin/activate; " + "conda activate testbed; " + "ELEVENLABS_API_KEY=dummy " + "pytest tests/ --disable-warnings --color=no --tb=no --verbose" + ) + + +@dataclass +class Httpxae1b9f66(PythonProfile): + owner: str = "encode" + repo: str = "httpx" + commit: str = "ae1b9f66238f75ced3ced5e4485408435de10768" + org_gh: str = "studentkaramuk-swesmith" + + +# ---- Auto-register all profiles defined above ---- +_BASE_CLASSES = {RepoProfile, GoProfile, PythonProfile} + +for _name, _obj in list(globals().items()): + if ( + isinstance(_obj, type) + and issubclass(_obj, RepoProfile) + and _obj not in _BASE_CLASSES + ): + registry.register_profile(_obj) diff --git a/benchmarks/swesmith/run_infer.py b/benchmarks/swesmith/run_infer.py new file mode 100644 index 00000000..fb80b2bd --- /dev/null +++ b/benchmarks/swesmith/run_infer.py @@ -0,0 +1,487 @@ +import json +import os +from pathlib import Path +from typing import List + +from jinja2 import Environment, FileSystemLoader + +from benchmarks.swesmith import constants +from benchmarks.swesmith.build_images import ( + extract_custom_tag, + get_official_docker_image, +) +from benchmarks.swesmith.config import INFER_DEFAULTS +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image +from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE +from benchmarks.utils.conversation import build_event_persistence_callback +from benchmarks.utils.critics import create_critic +from benchmarks.utils.dataset import get_dataset +from benchmarks.utils.evaluation import Evaluation +from benchmarks.utils.evaluation_utils import ( + construct_eval_output_dir, + get_default_on_result_writer, +) +from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.models import ( + EvalInstance, + EvalMetadata, + EvalOutput, +) +from benchmarks.utils.version import SDK_SHORT_SHA +from openhands.sdk import LLM, 
Agent, Conversation, get_logger +from openhands.sdk.workspace import RemoteWorkspace +from openhands.tools.preset.default import get_default_tools +from openhands.workspace import APIRemoteWorkspace, DockerWorkspace + + +logger = get_logger(__name__) + +_SSH_KEY_CONTAINER_PATH = "/workspace/github_key" +_GIT_SSH_COMMAND = ( + f"ssh -i {_SSH_KEY_CONTAINER_PATH}" + " -o StrictHostKeyChecking=accept-new" + " -o IdentitiesOnly=yes" +) + +_DEFAULT_SSH_KEYS = [ + "id_rsa", + "id_ecdsa", + "id_ecdsa_sk", + "id_ed25519", + "id_ed25519_sk", + "id_xmss", +] + + +def _find_ssh_key() -> Path | None: + """Find an SSH private key: GITHUB_USER_SSH_KEY env var first, then default paths.""" + key_path = os.environ.get("GITHUB_USER_SSH_KEY") + if key_path and Path(key_path).exists(): + return Path(key_path) + + ssh_dir = Path.home() / ".ssh" + for key_name in _DEFAULT_SSH_KEYS: + key_file = ssh_dir / key_name + if key_file.exists(): + return key_file + + return None + + +def get_instruction( + instance: dict, + metadata: EvalMetadata, + workspace_path: str, +) -> str: + """Generate instruction for the agent.""" + workspace_dir_name = instance["repo"].split("/")[-1] + assert metadata.details is not None + + # Set up Jinja2 environment + assert metadata.prompt_path is not None + prompts_dir = os.path.dirname(metadata.prompt_path) + template_name = os.path.basename(metadata.prompt_path) + env = Environment(loader=FileSystemLoader(prompts_dir)) + template = env.get_template(template_name) + + # Prepare context for rendering + context = { + "instance": instance, + "workspace_dir_name": workspace_dir_name, + "actual_workspace_path": workspace_path, + "metadata": metadata, + } + context["test_instructions"] = "" + + # Render the instruction + instruction = template.render(context) + return instruction + + +class SWESmithEvaluation(Evaluation): + """ + Process-based SWE-Smith evaluation implemented as a child of the + abstract Evaluation orchestrator. + + Implements: + - prepare_instances() + - prepare_workspace(instance) + - evaluate_instance(instance, workspace) + """ + + def prepare_instances(self) -> List[EvalInstance]: + logger.info("Setting up SWE-Smith evaluation data") + + df = get_dataset( + dataset_name=self.metadata.dataset, + split=self.metadata.dataset_split, + eval_limit=self.metadata.eval_limit, + selected_instances_file=self.metadata.selected_instances_file, + ) + + instances: List[EvalInstance] = [] + for _, row in df.iterrows(): + inst_id = str(row["instance_id"]) + instances.append(EvalInstance(id=inst_id, data=row.to_dict())) + + logger.info("Total instances to process: %d", len(instances)) + return instances + + # ---- Hook: prepare a workspace per instance ---------------------------------- + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ) -> RemoteWorkspace: + """ + Use DockerWorkspace by default. + + Args: + instance: The evaluation instance to prepare workspace for. + resource_factor: Resource factor for runtime allocation (default: 1). + Higher values allocate more CPU/memory resources. + Used by APIRemoteWorkspace for remote runtime allocation. 
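+
+        Note:
+            The agent-server image tag is constructed below as
+            ``{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}``,
+            where ``suffix`` is ``-{build_target}`` for non-binary build targets
+            (e.g. ``-source-minimal``) and empty for the binary target.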
+ """ + # ADAPTATION 1: Use image_name field from dataset instead of deriving + # from instance_id (SWE-Smith stores image name directly in dataset) + official_docker_image = get_official_docker_image(instance.data["image_name"]) + build_target = constants.DEFAULT_BUILD_TARGET + custom_tag = extract_custom_tag(official_docker_image) + # For non-binary targets, append target suffix + suffix = ( + f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" + ) + base_agent_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + ) + agent_server_image = base_agent_image + + # Forward all OPENHANDS_* env vars into the container with prefix stripped. + # e.g. OPENHANDS_ANTHROPIC_API_KEY becomes ANTHROPIC_API_KEY inside the container. + OPENHANDS_ENV_PREFIX = "OPENHANDS_" + forwarded_env_names = [] + for key, value in os.environ.items(): + if key.startswith(OPENHANDS_ENV_PREFIX): + stripped = key[len(OPENHANDS_ENV_PREFIX) :] + os.environ[stripped] = value + forwarded_env_names.append(stripped) + all_forward_env = list(forward_env or []) + forwarded_env_names + + volumes = [] + + # Forward GIT_SSH_COMMAND for private repo git fetch. + # The actual key is injected in evaluate_instance() via base64 to avoid + # Docker bind-mount permission issues. + ssh_key_path = _find_ssh_key() + if ssh_key_path: + all_forward_env.append("GIT_SSH_COMMAND") + os.environ["GIT_SSH_COMMAND"] = _GIT_SSH_COMMAND + logger.info(f"Found SSH key {ssh_key_path} for private repo access") + + if self.metadata.workspace_type == "docker": + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swesmith/build_images.py and set " + "SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." + ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if base_agent_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{base_agent_image}" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=all_forward_env, + volumes=volumes, + ) + elif self.metadata.workspace_type == "remote": + runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) + if not runtime_api_key: + raise ValueError( + "RUNTIME_API_KEY environment variable is not set for remote workspace" + ) + + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"Agent server image {agent_server_image} does not exist in container registry, " + "make sure to build, push it, and make it public accessible before using remote workspace." 
+ ) + logger.info( + f"Using remote workspace with image {agent_server_image} " + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + ) + startup_timeout = float( + os.getenv( + "REMOTE_RUNTIME_STARTUP_TIMEOUT", + str(constants.DEFAULT_REMOTE_RUNTIME_STARTUP_TIMEOUT), + ) + ) + workspace = APIRemoteWorkspace( + runtime_api_url=os.getenv( + "RUNTIME_API_URL", constants.DEFAULT_RUNTIME_API_URL + ), + runtime_api_key=runtime_api_key, + server_image=agent_server_image, + target_type="source" if "source" in build_target else "binary", + forward_env=forward_env or [], + resource_factor=resource_factor, + init_timeout=startup_timeout, + startup_wait_timeout=startup_timeout, + ) + else: + raise ValueError( + f"Unsupported workspace_type: {self.metadata.workspace_type}" + ) + + for cmd in self.metadata.env_setup_commands or []: + res = workspace.execute_command(cmd) + if res.exit_code != 0: + raise RuntimeError( + f"Failed to run env setup command '{cmd}': {res.stderr}" + ) + logger.debug(f"Ran env setup command '{cmd}': {res.stdout}") + return workspace + + # ---- Hook: evaluate one instance --------------------------------------------- + def evaluate_instance( + self, instance: EvalInstance, workspace: RemoteWorkspace + ) -> EvalOutput: + """ + Create conversation, run agent, collect history and git patch. + Do not write files here; just return EvalOutput. + """ + tools = get_default_tools( + # Disable browser tools in CLI mode + enable_browser=False, + ) + agent = Agent( + llm=self.metadata.llm, + tools=tools, + system_prompt_kwargs={"cli_mode": True}, + ) + + assert isinstance(workspace, RemoteWorkspace) + + repo_path = f"/workspace/{instance.data['repo'].split('/')[-1]}/" + instance.data["repo_path"] = repo_path + + persist_callback = build_event_persistence_callback( + run_id=self.metadata.eval_output_dir, + instance_id=instance.id, + attempt=self.current_attempt, + ) + + conversation = Conversation( + agent=agent, + workspace=workspace, + callbacks=[persist_callback], + max_iteration_per_run=self.metadata.max_iterations, + delete_on_close=True, + ) + + logger.info("repo_path: %s", repo_path) + cp_testebed_repo = workspace.execute_command( + (f"mkdir -p {repo_path} ; cp -r /testbed/. {repo_path}") + ) + assert cp_testebed_repo.exit_code == 0, ( + f"cp_testebed_repo failed: {cp_testebed_repo.stderr}" + ) + + # git reset + git_reset = workspace.execute_command(f"cd {repo_path} ; git reset --hard") + assert git_reset.exit_code == 0, f"git reset failed: {git_reset.stderr}" + + # Inject SSH key into container for private repo git fetch. + # We base64-encode and decode to avoid shell escaping issues and + # Docker bind-mount permission problems. 
+ ssh_key = _find_ssh_key() + if ssh_key: + import base64 + + key_b64 = base64.b64encode(ssh_key.read_bytes()).decode() + setup_ssh = workspace.execute_command( + f"echo '{key_b64}' | base64 -d > {_SSH_KEY_CONTAINER_PATH}" + f" && chmod 600 {_SSH_KEY_CONTAINER_PATH}" + ) + assert setup_ssh.exit_code == 0, f"SSH key setup failed: {setup_ssh.stderr}" + + # Fetch bug branch from GitHub mirror and checkout + # Use SSH URL for private repos when an SSH key is available + if ssh_key: + mirror_url = f"git@github.com:{instance.data['repo']}.git" + else: + mirror_url = f"https://github.com/{instance.data['repo']}.git" + git_fetch = workspace.execute_command( + f"cd {repo_path} ; git fetch {mirror_url} {instance.id}" + ) + assert git_fetch.exit_code == 0, f"git fetch failed: {git_fetch.stderr}" + git_checkout = workspace.execute_command( + f"cd {repo_path} ; git checkout FETCH_HEAD" + ) + assert git_checkout.exit_code == 0, ( + f"git checkout failed: {git_checkout.stderr}" + ) + + # Remove untracked files (respects .gitignore, so installed deps are preserved) + workspace.execute_command(f"cd {repo_path} ; git clean -fdq") + + # Capture HEAD after checkout so base_commit reflects the bug branch + head_result = workspace.execute_command(f"cd {repo_path} ; git rev-parse HEAD") + assert head_result.exit_code == 0, ( + f"git rev-parse HEAD failed: {head_result.stderr}" + ) + base_commit = head_result.stdout.strip() + instance.data["base_commit"] = base_commit + + instruction = get_instruction( + instance=instance.data, + metadata=self.metadata, + workspace_path=workspace.working_dir, + ) + conversation.send_message(instruction) + # Run conversation with fake user responses to handle agent messages + run_conversation_with_fake_user_response(conversation) + + # git add + workspace.execute_command(f"cd {repo_path} ; git add -A") + + # git commit + # Use --no-verify to bypass pre-commit hooks (e.g., husky) that can fail + workspace.execute_command( + f"cd {repo_path} && " + f"git config --global user.email '{constants.GIT_USER_EMAIL}' && " + f"git config --global user.name '{constants.GIT_USER_NAME}' && " + f"git commit --no-verify -m '{constants.GIT_COMMIT_MESSAGE}'" + ) + + # Get git patch + git_patch_result = workspace.execute_command( + (f"cd {repo_path} ; git --no-pager diff --no-color {base_commit} HEAD") + ) + assert git_patch_result.exit_code == 0, ( + f"git diff failed: {git_patch_result.stderr}" + ) + git_patch = git_patch_result.stdout + + # EvalOutput is your model; keep fields consistent with prior JSONL + out = EvalOutput( + instance_id=instance.id, + attempt=self.current_attempt, + test_result={ + "git_patch": git_patch, + }, + instruction=instruction, + error=None, + history=list(conversation.state.events), + metrics=conversation.conversation_stats.get_combined_metrics(), + ) + return out + + +def main() -> None: + from dotenv import load_dotenv + + load_dotenv() + + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + + parser = get_parser() + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) + parser.set_defaults(**INFER_DEFAULTS) + args = parser.parse_args() + + # Validate max_attempts + if args.max_attempts < 1: + raise ValueError(f"max_attempts must be >= 1, 
got {args.max_attempts}") + + llm_config_path = args.llm_config_path + if not os.path.isfile(llm_config_path): + raise ValueError(f"LLM config file {llm_config_path} does not exist") + with open(llm_config_path, "r") as f: + llm_config = f.read() + llm = LLM.model_validate_json(llm_config) + logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) + + dataset_description = ( + args.dataset.replace("/", "__") + "-" + args.split.replace("/", "__") + ) + + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=dataset_description, + model_name=llm.model, + max_iterations=args.max_iterations, + eval_note=args.note, + ) + + # Create critic instance from parsed arguments + critic = create_critic(args) + logger.info(f"Using critic: {type(critic).__name__}") + + metadata = EvalMetadata( + llm=llm, + dataset=args.dataset, + dataset_split=args.split, + max_iterations=args.max_iterations, + eval_output_dir=structured_output_dir, + details={}, + prompt_path=args.prompt_path, + eval_limit=args.n_limit, + env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"], + max_attempts=args.max_attempts, + critic=critic, + selected_instances_file=args.select, + max_retries=args.max_retries, + workspace_type=args.workspace, + ) + + # Run orchestrator with a simple JSONL writer + evaluator = SWESmithEvaluation( + metadata=metadata, + num_workers=args.num_workers, + ) + + evaluator.run(on_result=get_default_on_result_writer(evaluator.output_path)) + + logger.info("Evaluation completed!") + # Emit machine-readable path for callers + print(json.dumps({"output_json": str(evaluator.output_path)})) + + +if __name__ == "__main__": + main() From 3fc2bd0efdfd0b0934613b91b4255ebe55645235 Mon Sep 17 00:00:00 2001 From: Rb Date: Wed, 18 Feb 2026 13:16:31 +0300 Subject: [PATCH 2/3] docs(swesmith): add prompt template, README, and env example Co-Authored-By: Muhammed Karamuk --- benchmarks/swesmith/.env.example | 6 + benchmarks/swesmith/README.md | 192 +++++++++++++++++++++++-- benchmarks/swesmith/prompts/default.j2 | 61 ++++++++ 3 files changed, 250 insertions(+), 9 deletions(-) create mode 100644 benchmarks/swesmith/.env.example create mode 100644 benchmarks/swesmith/prompts/default.j2 diff --git a/benchmarks/swesmith/.env.example b/benchmarks/swesmith/.env.example new file mode 100644 index 00000000..ad7bc859 --- /dev/null +++ b/benchmarks/swesmith/.env.example @@ -0,0 +1,6 @@ +# SSH key for private repo access (optional, only needed for non-standard key paths) +# If not set, default keys in ~/.ssh/ are used automatically. +# GITHUB_USER_SSH_KEY=/home/user/.ssh/id_ed25519_github + +# GitHub token (optional, increases GitHub API rate limit from 60 to 5000 req/hour) +# GITHUB_TOKEN= diff --git a/benchmarks/swesmith/README.md b/benchmarks/swesmith/README.md index 7e69768f..dbb16646 100644 --- a/benchmarks/swesmith/README.md +++ b/benchmarks/swesmith/README.md @@ -1,19 +1,23 @@ -# SWE-Smith Benchmark - building Docker images +# SWE-Smith Benchmark Evaluation -This directory contains implementation for building custom agent server Docker images for SWE-Smith. The primary purpose is to use GitHub workflows for building these images fast and using them to train LLMs as SWE agents. +This directory contains the implementation for running SWE-Smith evaluation using OpenHands agents. + +## Overview + +SWE-Smith is a benchmark for training and evaluating AI agents on synthetically generated software engineering tasks. 
Task instances are created by injecting bugs into real repositories and validating them against test suites. ## Dataset -- **Source**: [Paper](https://arxiv.org/abs/2504.21798) -- **Dataset**: - - `SWE-bench/SWE-smith-py` - Full dataset +- **Source**: [SWE-Smith Paper](https://arxiv.org/abs/2504.21798) +- **Dataset**: `SWE-bench/SWE-smith-py` - **Splits**: `train` +- Local task instance files (`.json` / `.jsonl`) generated via SWE-Smith are also supported. ## Usage -### Build Docker Images +### Step 1: Build Docker Images -You need to build Docker images for the SWE-Smith instances. Each instance requires a specific environment setup based on the repository and issue. **Note that this will consume atleast 150-200GB of disk space. Considering setting `--n-limit` to a smaller value if required.** +Before running inference, you need to build Docker images for the SWE-Smith instances. Each instance requires a specific environment setup. Disk usage depends on the number and size of task instances — the full dataset can consume 150-200GB, but smaller local instance files will use proportionally less. ```bash uv run python -m benchmarks.swesmith.build_images \ @@ -23,7 +27,177 @@ uv run python -m benchmarks.swesmith.build_images \ --target source-minimal ``` -### Running rollouts +For local task instance files: + +```bash +uv run python -m benchmarks.swesmith.build_images \ + --dataset /path/to/task_instances.json \ + --split train \ + --image ghcr.io/openhands/eval-agent-server \ + --target source-minimal \ + --n-limit 10 +``` + +### Step 2: Run Inference + +```bash +uv run swesmith-infer path/to/llm_config.json \ + --dataset /path/to/task_instances.json \ + --workspace docker \ + --max-iterations 75 \ + --num-workers 4 +``` + +**Selecting specific instances:** + +```bash +# Create instances.txt with one instance ID per line +echo "encode__httpx.ae1b9f66.lm_modify__abc123" > instances.txt + +uv run swesmith-infer path/to/llm_config.json \ + --dataset /path/to/task_instances.json \ + --select instances.txt \ + --workspace docker +``` + +### Configuration Options + +| Argument | Description | Default | +|----------|-------------|---------| +| `--dataset` | HuggingFace dataset name or local file path | `SWE-bench/SWE-smith-py` | +| `--split` | Dataset split | `train` | +| `--workspace` | Workspace type | `docker` | +| `--num-workers` | Parallel workers | `4` | +| `--max-iterations` | Max agent turns per instance | `500` | +| `--n-limit` | Limit number of instances | all | +| `--select` | Text file with instance IDs (one per line) | - | +| `--max-attempts` | Retry attempts with critic | `3` | +| `--critic` | `pass` / `finish_with_patch` / `empty_patch_critic` | `finish_with_patch` | +| `--prompt-path` | Jinja2 prompt template | `prompts/default.j2` | +| `--note` | Note appended to output directory name | - | + +### Private Repositories + +For private repos, an SSH key must be accessible. The lookup order is: + +1. `GITHUB_USER_SSH_KEY` environment variable (path to key file) +2. `~/.ssh/id_rsa`, `id_ecdsa`, `id_ecdsa_sk`, `id_ed25519`, `id_ed25519_sk` (first match) + +```bash +# Only needed if your key has a non-standard name +export GITHUB_USER_SSH_KEY=~/.ssh/my_custom_key +``` + +### Environment Variables + +Environment variables can be set directly or via a `.env` file in the project root. + +All environment variables prefixed with `OPENHANDS_` are forwarded into the Docker container with the prefix stripped. 
For example, `OPENHANDS_ANTHROPIC_API_KEY` becomes `ANTHROPIC_API_KEY` inside the container. This is how you pass LLM API keys and other credentials to the agent. + +```bash +export OPENHANDS_ANTHROPIC_API_KEY=sk-xxx +export OPENHANDS_OPENAI_API_KEY=sk-xxx +export OPENHANDS_GOOGLE_APPLICATION_CREDENTIALS='{"type":"service_account",...}' +``` + +| Variable | Description | +|----------|-------------| +| `OPENHANDS_*` | Forwarded into the container with prefix stripped (LLM keys, credentials, etc.) | +| `GITHUB_USER_SSH_KEY` | Path to SSH key for private repos | +| `SKIP_BUILD` | Set to `1` to skip Docker image building during inference (default: `1`) | + +## Evaluation + +After running inference, evaluate the generated patches: + +```bash +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json +``` + +**Advanced options:** + +```bash +# Faster evaluation using only fail-to-pass tests +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --f2p-only + +# Re-evaluate failed/errored instances +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --redo-existing + +# Only regenerate the report from existing evaluation logs +uv run swesmith-eval output.jsonl \ + --run-id my_eval \ + --dataset /path/to/task_instances.json \ + --report-only +``` + +## Output Structure + +``` +eval_outputs/ +└── -/ + └── / + ├── output.jsonl # Main results + ├── output.critic_attempt_N.jsonl # Per-attempt results + ├── output.swesmith.jsonl # SWE-Smith format predictions + ├── output.report.json # Evaluation report (SWE-Smith format) + ├── cost_report.jsonl # Token usage and cost + └── conversations/ # Per-instance conversation logs + └── .tar.gz +``` + +**Inference result** (`output.jsonl`, one entry per line): + +```json +{ + "instance_id": "encode__httpx.ae1b9f66.lm_modify__abc123", + "attempt": 1, + "test_result": { + "git_patch": "diff --git a/file.py b/file.py\n..." + }, + "instruction": "...", + "history": [], + "metrics": {}, + "error": null +} +``` + +**Evaluation report** (`output.report.json`) follows the SWE-Smith report format: + +```json +{ + "resolved": 5, + "unresolved": 3, + "total": 8, + "ids_resolved": ["instance_1", "..."], + "ids_unresolved": ["instance_3", "..."] +} +``` + +## Custom Repository Profiles + +To add a custom repository, define a profile class in `profiles.py`: + +```python +@dataclass +class MyRepoBcd12345(PythonProfile): + owner: str = "github-org" + repo: str = "my-repo" + commit: str = "bcd1234567890" + org_gh: str = "org-swesmith" +``` + +Profiles are auto-registered on import. For Go repositories, inherit from `GoProfile` instead. -This is not supported yet for SWE-Smith because the primary purpose of this directory is fast and smooth creation of Docker images. +## References +- [SWE-Smith Paper](https://arxiv.org/abs/2504.21798) +- [SWE-Smith GitHub](https://github.com/SWE-bench/SWE-smith) +- [SWE-Smith Dataset on HuggingFace](https://huggingface.co/datasets/SWE-bench/SWE-smith) diff --git a/benchmarks/swesmith/prompts/default.j2 b/benchmarks/swesmith/prompts/default.j2 new file mode 100644 index 00000000..6adfc035 --- /dev/null +++ b/benchmarks/swesmith/prompts/default.j2 @@ -0,0 +1,61 @@ +I have access to a code repository in the directory {{ instance.repo_path }} . You can explore and modify files using the available tools. 
Consider the following issue description: + + +{{ instance.problem_statement }} + + +Can you help me implement the necessary changes to the repository so that the requirements specified in the are met? +I've already taken care of all changes to any of the test files described in the . This means you DON'T have to modify the testing logic or any of the tests in any way! +Also the development environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages. +Your task is to make the minimal changes to non-test files in the {{ instance.repo_path }} directory to ensure the is satisfied. + +Follow these phases to resolve the issue: + +Phase 1. READING: read the problem and reword it in clearer terms + 1.1 If there are code or config snippets. Express in words any best practices or conventions in them. + 1.2 Hightlight message errors, method names, variables, file names, stack traces, and technical details. + 1.3 Explain the problem in clear terms. + 1.4 Enumerate the steps to reproduce the problem. + 1.5 Hightlight any best practices to take into account when testing and fixing the issue + +Phase 2. RUNNING: set up and run the tests on the repository + 2.1 Explore the repository structure and any build/test configuration files to understand how to run the project. + 2.2 Activate any required environment (e.g., virtualenv, conda, nvm) if applicable. + 2.3 Iterate and figure out how to run the tests. + +Phase 3. EXPLORATION: find the files that are related to the problem and possible solutions + 3.1 Use `grep` to search for relevant methods, classes, keywords and error messages. + 3.2 Identify all files related to the problem statement. + 3.3 Propose the methods and files to fix the issue and explain why. + 3.4 From the possible file locations, select the most likely location to fix the issue. + +Phase 4. TEST CREATION: before implementing any fix, create a script to reproduce and verify the issue. + 4.1 Look at existing test files in the repository to understand the test format/structure. + 4.2 Create a minimal reproduction script that reproduces the located issue. + 4.3 Run the reproduction script to confirm you are reproducing the issue. + 4.4 Adjust the reproduction script as necessary. + +Phase 5. FIX ANALYSIS: state clearly the problem and how to fix it + 5.1 State clearly what the problem is. + 5.2 State clearly where the problem is located. + 5.3 State clearly how the test reproduces the issue. + 5.4 State clearly the best practices to take into account in the fix. + 5.5 State clearly how to fix the problem. + +Phase 6. FIX IMPLEMENTATION: Edit the source code to implement your chosen solution. + 6.1 Make minimal, focused changes to fix the issue. + +Phase 7. VERIFICATION: Test your implementation thoroughly. + 7.1 Run your reproduction script to verify the fix works. + 7.2 Add edge cases to your test script to ensure comprehensive coverage. + 7.3 Run existing tests related to the modified code to ensure you haven't broken anything. + +8. FINAL REVIEW: Carefully re-read the problem description and compare your changes with the base commit {{ instance.base_commit }}. + 8.1 Ensure you've fully addressed all requirements. + 8.2 Run any tests in the repository related to: + 8.2.1 The issue you are fixing + 8.2.2 The files you modified + 8.2.3 The functions you changed + 8.3 If any tests fail, revise your implementation until all tests pass + +Be thorough in your exploration, testing, and reasoning. 
It's fine if your thinking process is lengthy - quality and completeness are more important than brevity. From 8e2a05d8fa5c5c508b7dc192e25749983405db88 Mon Sep 17 00:00:00 2001 From: Muhammed Karamuk Date: Wed, 18 Feb 2026 13:16:37 +0300 Subject: [PATCH 3/3] fix(utils): support .json files in dataset loader Co-Authored-By: Rb --- benchmarks/utils/dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/utils/dataset.py b/benchmarks/utils/dataset.py index a60356ca..3eeab8d0 100644 --- a/benchmarks/utils/dataset.py +++ b/benchmarks/utils/dataset.py @@ -114,7 +114,9 @@ def get_dataset( ) -> pd.DataFrame: """Load and prepare dataset for evaluation.""" # Check if dataset_name is a local file path - if os.path.isfile(dataset_name) and dataset_name.endswith(".jsonl"): + if os.path.isfile(dataset_name) and ( + dataset_name.endswith(".jsonl") or dataset_name.endswith(".json") + ): # Load local JSONL file dataset = load_dataset("json", data_files=dataset_name, split="train") assert isinstance(dataset, Dataset)