From 5249749d2baa8f1a9bd3ed25a2185dd5fefaccfe Mon Sep 17 00:00:00 2001 From: Ewa Dobrowolska Date: Thu, 12 Feb 2026 20:43:51 +0100 Subject: [PATCH 1/2] nemo-evaluator implementation --- VERSION | 1 + benchmarks/commit0/eval_infer.py | 1 + benchmarks/commit0/run_infer.py | 63 +++-- benchmarks/gaia/config.py | 1 + benchmarks/gaia/eval_infer.py | 1 + benchmarks/gaia/run_infer.py | 53 ++-- benchmarks/multiswebench/build_images.py | 20 +- benchmarks/multiswebench/eval_infer.py | 53 +++- benchmarks/multiswebench/run_infer.py | 66 +++-- benchmarks/openagentsafety/build_images.py | 16 +- benchmarks/openagentsafety/run_infer.py | 119 +++++++- benchmarks/scripts/generate_llm_config.py | 84 ++++++ benchmarks/scripts/run_benchmark.py | 209 ++++++++++++++ benchmarks/swebench/eval_infer.py | 48 +++- benchmarks/swebench/run_infer.py | 26 +- benchmarks/swebenchmultimodal/eval_infer.py | 160 +++++++---- benchmarks/swebenchmultimodal/run_infer.py | 28 +- benchmarks/swtbench/eval_infer.py | 57 ++-- benchmarks/swtbench/run_infer.py | 58 ++-- benchmarks/utils/args_parser.py | 13 + benchmarks/utils/build_utils.py | 11 +- benchmarks/utils/constants.py | 8 +- benchmarks/utils/evaluation.py | 45 ++- benchmarks/utils/fake_user_response.py | 10 +- benchmarks/utils/image_utils.py | 20 ++ benchmarks/utils/llm_config.py | 37 +++ benchmarks/utils/models.py | 14 + benchmarks/utils/version.py | 15 +- .../openhands_benchmarks/__init__.py | 3 + .../openhands_benchmarks/framework.yml | 266 ++++++++++++++++++ nemo_evaluator/openhands_benchmarks/output.py | 54 ++++ pyproject.toml | 12 +- 32 files changed, 1326 insertions(+), 246 deletions(-) create mode 100644 VERSION create mode 100644 benchmarks/scripts/generate_llm_config.py create mode 100644 benchmarks/scripts/run_benchmark.py create mode 100644 benchmarks/utils/llm_config.py create mode 100644 nemo_evaluator/openhands_benchmarks/__init__.py create mode 100644 nemo_evaluator/openhands_benchmarks/framework.yml create mode 100644 nemo_evaluator/openhands_benchmarks/output.py diff --git a/VERSION b/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/benchmarks/commit0/eval_infer.py b/benchmarks/commit0/eval_infer.py index 8b45f85c..7300fa34 100644 --- a/benchmarks/commit0/eval_infer.py +++ b/benchmarks/commit0/eval_infer.py @@ -120,6 +120,7 @@ def process_commit0_results(input_file: str, output_file: str) -> None: # Generate report report = { + "benchmark": "commit0", "total_instances": 16, # Fixed as per requirement "submitted_instances": len(completed_ids), "completed_instances": len(completed_ids), diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index df10feb2..a4d29c74 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -29,11 +29,12 @@ EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -185,16 +186,35 @@ def prepare_workspace( logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": - # Build 
agent-server image from base commit0 image - workspace = DockerDevWorkspace( - base_image=base_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], - ) - logger.info( - f"Building workspace from {base_docker_image}. This may take a while..." - ) + # Try to build agent-server image from base commit0 image + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image=base_docker_image, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + logger.info( + f"Building workspace from {base_docker_image}. This may take a while..." + ) + except Exception: + custom_tag = extract_custom_tag(base_docker_image) + suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: @@ -202,11 +222,10 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) custom_tag = extract_custom_tag(base_docker_image) suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): @@ -217,7 +236,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -578,7 +597,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -605,12 +627,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -630,6 +647,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -639,6 +657,7 @@ def main() -> 
None: critic=create_critic(args), selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/gaia/config.py b/benchmarks/gaia/config.py index dadaa20a..08fb7027 100644 --- a/benchmarks/gaia/config.py +++ b/benchmarks/gaia/config.py @@ -10,6 +10,7 @@ "split": "validation", "level": "2023_all", "num_workers": 30, + "critic": "pass", } # Build defaults (used by build_images.py) diff --git a/benchmarks/gaia/eval_infer.py b/benchmarks/gaia/eval_infer.py index 7aad859b..bc11ebc0 100644 --- a/benchmarks/gaia/eval_infer.py +++ b/benchmarks/gaia/eval_infer.py @@ -148,6 +148,7 @@ def process_gaia_results( # Generate report report = { + "benchmark": "gaia", "total_instances": len(submitted_ids), "submitted_instances": len(submitted_ids), "completed_instances": len(completed_ids), diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index 8f52d171..5d3987ab 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -28,7 +28,8 @@ from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.llm_config import load_llm_config +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( LLM, Agent, @@ -42,7 +43,7 @@ ) from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -151,12 +152,29 @@ def prepare_workspace( logger.info(f"Preparing workspace for instance {instance.id}") if self.metadata.workspace_type == "docker": - # Use DockerDevWorkspace with base image (same as main branch) - workspace = DockerDevWorkspace( - base_image="nikolaik/python-nodejs:python3.12-nodejs22", - working_dir="/workspace", - forward_env=forward_env or [], - ) + # Use DockerDevWorkspace with base image + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image="nikolaik/python-nodejs:python3.12-nodejs22", + working_dir="/workspace", + forward_env=forward_env or [], + ) + except Exception as build_error: + build_target = os.getenv("GAIA_BUILD_TARGET", "binary-minimal") + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-{build_target}" + ) + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image # GAIA uses a universal agent server image (one image for all instances) @@ -169,9 +187,8 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" ) if not 
image_exists(agent_server_image): @@ -182,7 +199,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with GAIA image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -349,7 +366,9 @@ def evaluate_instance( else: conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # Extract answer from conversation history model_answer_raw = self._extract_answer_from_history(conversation.state.events) @@ -565,12 +584,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct dataset description @@ -591,12 +605,15 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={"level": args.level}, eval_limit=args.n_limit, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, + max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py index 3ecdeeb6..86894e04 100644 --- a/benchmarks/multiswebench/build_images.py +++ b/benchmarks/multiswebench/build_images.py @@ -8,15 +8,16 @@ --image ghcr.io/openhands/eval-agent-server --target source-minimal """ +import json import os from pathlib import Path +from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.utils.build_utils import ( build_all_images, default_build_output_dir, get_build_parser, ) -from benchmarks.utils.dataset import get_dataset from openhands.sdk import get_logger @@ -37,7 +38,7 @@ def get_official_docker_image( # For Multi-SWE-Bench, the image naming depends on the language repo = instance["repo"] - version = instance["version"] + version = instance.get("version", "") if LANGUAGE == "python": # Use SWE-bench style naming for Python @@ -52,7 +53,7 @@ def get_official_docker_image( else: org = instance.get("org", repo) repo_name = repo - official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base" + official_image_name = f"{docker_image_prefix}/{org}_m_{repo_name}:base".lower() logger.debug(f"Multi-SWE-Bench image: {official_image_name}") return official_image_name @@ -79,12 +80,16 @@ def extract_custom_tag(base_image: str) -> str: def get_base_images_from_dataset(dataset_name: str, split: str) -> list[str]: """Get all unique base images from the dataset.""" - dataset = get_dataset(dataset_name, split) + local_path = download_and_concat_dataset(dataset_name, LANGUAGE) base_images = set() - for _, row in dataset.iterrows(): - image = get_official_docker_image(row.to_dict()) - 
base_images.add(image) + with open(local_path, "r", encoding="utf-8") as f: + for line in f: + if not line.strip(): + continue + instance = json.loads(line) + image = get_official_docker_image(instance) + base_images.add(image) return list(base_images) @@ -107,6 +112,7 @@ def main(): build_dir=Path( args.output_dir or default_build_output_dir(args.dataset, args.split) ), + base_image_to_custom_tag_fn=extract_custom_tag, max_workers=args.num_workers, dry_run=False, ) diff --git a/benchmarks/multiswebench/eval_infer.py b/benchmarks/multiswebench/eval_infer.py index 3bb88cf1..a2efeb86 100644 --- a/benchmarks/multiswebench/eval_infer.py +++ b/benchmarks/multiswebench/eval_infer.py @@ -10,8 +10,10 @@ """ import argparse +import json import shutil import subprocess +import sys from pathlib import Path from benchmarks.multiswebench.download_dataset import download_and_concat_dataset @@ -59,8 +61,8 @@ def run_multi_swebench_evaluation( # Create config file for Multi-SWE-Bench config_file = work_dir / "config.json" - # Handle dataset path - download if it's a ByteDance-Seed/Multi-SWE-bench dataset - if dataset_name.startswith("ByteDance-Seed/Multi-SWE-bench"): + # Handle dataset path - download if it's Multi-SWE-Bench + if "multi-swe-bench" in dataset_name.lower(): logger.info(f"Downloading Multi-SWE-bench dataset for language: {lang}") dataset_path = download_and_concat_dataset(dataset_name, lang) else: @@ -73,17 +75,39 @@ def run_multi_swebench_evaluation( # Run the Multi-SWE-Bench evaluation logger.info("Running Multi-SWE-Bench evaluation harness...") - cmd = [ - "uv", - "run", - "python", - "-m", - "multi_swe_bench.harness.run_evaluation", + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "multi_swe_bench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "multi_swe_bench.harness.run_evaluation", + ] + + cmd.extend([ "--config", str(config_file.resolve()), "--mode", "evaluation", - ] + ]) logger.info(f"Evaluation command: {' '.join(cmd)}") @@ -96,11 +120,13 @@ def run_multi_swebench_evaluation( error_msg = f"Evaluation failed with return code {result.returncode}" print(f"ERROR: {error_msg}") logger.error(error_msg) + raise subprocess.CalledProcessError(result.returncode, cmd) except Exception as e: error_msg = f"Error running evaluation: {e}" print(f"ERROR: {error_msg}") logger.error(error_msg) + raise def main(): @@ -140,10 +166,17 @@ def main(): logger.info(f"Results saved to {results_file}") # Move the report file to the output location - output_report_path = args.input_file.with_suffix(".report.json") + output_report_path = Path(args.input_file).with_suffix(".report.json") shutil.move(str(results_file), str(output_report_path)) logger.info(f"Report moved to {output_report_path}") + # Add benchmark field to the report + with open(output_report_path, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = f"multiswebench-{args.lang}" + with open(output_report_path, "w") as f: + json.dump(report_data, f, indent=4) + # Update Laminar datapoints with evaluation scores LaminarService.get().update_evaluation_scores( str(args.input_file), str(output_report_path) diff --git a/benchmarks/multiswebench/run_infer.py 
b/benchmarks/multiswebench/run_infer.py index afeb7f6e..ec3337d1 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -118,9 +119,9 @@ def __init__(self, metadata: MultiSWEBenchEvalMetadata, **kwargs): def prepare_instances(self) -> List[EvalInstance]: logger.info("Setting up Multi-SWE-bench evaluation data") - # Check if this is a ByteDance-Seed/Multi-SWE-bench dataset that needs downloading + # Check if this is a Multi-SWE-bench dataset that needs downloading dataset_path = self.metadata.dataset - if dataset_path.startswith("ByteDance-Seed/Multi-SWE-bench"): + if "multi-swe-bench" in dataset_path.lower(): metadata = cast(MultiSWEBenchEvalMetadata, self.metadata) logger.info( f"Downloading Multi-SWE-bench dataset for language: {metadata.lang}" @@ -207,7 +208,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( "1", @@ -224,20 +225,28 @@ def prepare_workspace( "MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " "agent-server image." 
) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" + try: + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, ) + logger.info(f"Image build output: {output}") + if output.error is not None: + raise RuntimeError(f"Image build failed: {output.error}") + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + except Exception as build_error: + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + logger.info(f"Using pre-built image {agent_server_image}") workspace = DockerWorkspace( server_image=agent_server_image, @@ -246,14 +255,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -262,7 +270,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -366,7 +374,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -416,7 +426,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -442,12 +455,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description 
= ( @@ -472,6 +480,7 @@ def main() -> None: dataset_split=args.split, lang=args.lang, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -481,6 +490,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/openagentsafety/build_images.py b/benchmarks/openagentsafety/build_images.py index acb18384..1548bf2c 100644 --- a/benchmarks/openagentsafety/build_images.py +++ b/benchmarks/openagentsafety/build_images.py @@ -1,6 +1,7 @@ """Build OpenAgentSafety Docker image from vendor/software-agent-sdk""" import logging +import os import subprocess from pathlib import Path @@ -31,6 +32,16 @@ def get_vendor_sdk_commit() -> str: return result.stdout.strip() +def get_image_name() -> str: + image_name = os.getenv("EVAL_AGENT_SERVER_IMAGE", "openagentsafety-agent-server") + tag_prefix = os.getenv("IMAGE_TAG_PREFIX") + if tag_prefix: + tag = f"{tag_prefix}-openagentsafety" + else: + tag = get_vendor_sdk_commit() + return f"{image_name}:{tag}" + + def check_image_exists(image_name: str) -> bool: """Check if a Docker image exists locally.""" result = subprocess.run( @@ -48,12 +59,13 @@ def build_workspace_image(force_rebuild: bool = False, no_cache: bool = False) - force_rebuild: if True, ignore existing images and rebuild. no_cache: if True, pass --no-cache to docker build to avoid layer cache. """ - sdk_commit = get_vendor_sdk_commit() - image_name = f"openagentsafety-agent-server:{sdk_commit}" + image_name = get_image_name() if not force_rebuild and check_image_exists(image_name): logger.info(f"#### Using existing image: {image_name}") return image_name + + sdk_commit = get_vendor_sdk_commit() logger.info(f"#### Building Docker image: {image_name}") logger.info(f"#### SDK version: {sdk_commit}") diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index b9afff4f..e3ca9a8f 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -12,7 +12,11 @@ import requests from jinja2 import Environment, FileSystemLoader -from benchmarks.openagentsafety.build_images import build_workspace_image +from benchmarks.openagentsafety.build_images import ( + build_workspace_image, + check_image_exists, + get_image_name, +) from benchmarks.utils.args_parser import get_parser from benchmarks.utils.conversation import build_event_persistence_callback from benchmarks.utils.critics import create_critic @@ -20,6 +24,7 @@ from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace @@ -38,12 +43,16 @@ def convert_numpy_types(obj: Any) -> Any: return float(obj) elif isinstance(obj, np.ndarray): return obj.tolist() - elif pd.isna(obj): - return None elif isinstance(obj, dict): return {k: convert_numpy_types(v) for k, v in obj.items()} elif isinstance(obj, list): return [convert_numpy_types(item) for item in obj] + else: + try: + if pd.isna(obj): + return None + except (ValueError, 
TypeError): + pass return obj @@ -57,8 +66,13 @@ def default(self, o): return float(o) elif isinstance(o, np.ndarray): return o.tolist() - elif pd.isna(o): - return None + elif hasattr(o, "model_dump"): + return o.model_dump() + try: + if pd.isna(o): + return None + except (ValueError, TypeError): + pass return super().default(o) @@ -183,7 +197,7 @@ def cleanup_docker_containers(): "-a", "-q", "--filter", - "ancestor=openagentsafety-agent-server:local", + f"ancestor={get_image_name()}", ], capture_output=True, text=True, @@ -378,7 +392,17 @@ def prepare_workspace( resource_factor: Resource factor for runtime allocation (default: 1). forward_env: Environment variables to forward into the workspace. """ - server_image = build_workspace_image() + # Try to build image on-the-fly, fall back to pre-built if build fails + try: + server_image = build_workspace_image() + except Exception as build_error: + server_image = get_image_name() + + if not check_image_exists(server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {server_image} does not exist" + ) + logger.info(f"Using pre-built image {server_image}") workspace = DockerWorkspace( server_image=server_image, @@ -462,7 +486,9 @@ def event_callback(event) -> None: try: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) logger.info(f"Conversation completed for {instance.id}") except ValidationError as e: logger.warning(f"Validation error from custom events (continuing): {e}") @@ -530,6 +556,65 @@ def event_callback(event) -> None: ) +def generate_report(output_jsonl: str, report_path: str, model_name: str) -> None: + """Generate a .report.json from the output.jsonl, matching the format + expected by nemo_evaluator (same schema as SWE-Bench / GAIA reports).""" + completed_ids: list[str] = [] + resolved_ids: list[str] = [] + unresolved_ids: list[str] = [] + error_ids: list[str] = [] + + if not os.path.exists(output_jsonl): + logger.warning("No output.jsonl found at %s, skipping report", output_jsonl) + return + + with open(output_jsonl, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError: + continue + + instance_id = data.get("instance_id", "") + error = data.get("error") + test_result = data.get("test_result", {}) + + if error or test_result.get("error"): + error_ids.append(instance_id) + else: + completed_ids.append(instance_id) + # Treat as resolved when there is no error + resolved_ids.append(instance_id) + + submitted_ids = completed_ids + error_ids + report = { + "benchmark": "openagentsafety", + "model_name_or_path": model_name, + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": 0, + "resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "empty_patch_instances": 0, + "error_instances": len(error_ids), + "submitted_ids": submitted_ids, + "completed_ids": completed_ids, + "incomplete_ids": [], + "resolved_ids": resolved_ids, + "unresolved_ids": unresolved_ids, + } + + with open(report_path, "w") as f: + json.dump(report, f, indent=4) + + logger.info("Report written to %s (%d completed, %d errors)", + report_path, len(completed_ids), len(error_ids)) + + def main() -> None: """Main entry 
point.""" parser = get_parser(add_llm_config=True) @@ -542,12 +627,7 @@ def main() -> None: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") # Load LLM config - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) # Construct output directory @@ -572,15 +652,18 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={ - "server_image": "openagentsafety-agent-server:local", + "server_image": get_image_name(), "platform": "linux/amd64", }, eval_limit=args.n_limit, max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, + max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, ) # Initial cleanup @@ -636,6 +719,12 @@ def _cb(instance: EvalInstance, out: EvalOutput) -> None: # Run evaluation evaluator.run(on_result=_default_on_result_writer(metadata.eval_output_dir)) + # Generate .report.json for nemo_evaluator compatibility + report_path = os.path.join( + metadata.eval_output_dir, "output.report.json" + ) + generate_report(evaluator.output_path, report_path, llm.model) + # Final cleanup cleanup_docker_containers() diff --git a/benchmarks/scripts/generate_llm_config.py b/benchmarks/scripts/generate_llm_config.py new file mode 100644 index 00000000..344ac36b --- /dev/null +++ b/benchmarks/scripts/generate_llm_config.py @@ -0,0 +1,84 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path + + +def generate_config( + model: str, + output_path: str, + api_base_url: str | None = None, + api_key_env: str | None = None, + temperature: float | None = None, + top_p: float | None = None, + max_completion_tokens: int | None = None, + timeout: int | None = None, + max_retries: int | None = None, +) -> None: + llm_config: dict[str, object] = {"model": model} + + if api_base_url: + llm_config["base_url"] = api_base_url + if api_key_env: + llm_config["api_key_env"] = api_key_env + if temperature is not None: + llm_config["temperature"] = temperature + if top_p is not None: + llm_config["top_p"] = top_p + if max_completion_tokens is not None: + llm_config["max_output_tokens"] = max_completion_tokens + if timeout is not None: + llm_config["timeout"] = timeout + if max_retries is not None: + llm_config["num_retries"] = max_retries + + out_path = Path(output_path) + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text(json.dumps(llm_config, indent=2) + "\n", encoding="utf-8") + + print(f"Wrote LLM config to {str(out_path)}") + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Generate LLM config from CLI args", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument("--model", type=str, required=True, help="Model name/id") + parser.add_argument("--api-base-url", type=str, help="API base URL") + parser.add_argument( + "--api-key-env", + type=str, + help="Environment variable name containing the API key", + ) + parser.add_argument("--temperature", type=float, help="Sampling temperature") + parser.add_argument("--top-p", type=float, help="Nucleus sampling (top-p)") + 
parser.add_argument("--max-completion-tokens", type=int, help="Max completion tokens") + parser.add_argument("--timeout", type=int, help="API timeout in seconds") + parser.add_argument("--max-retries", type=int, help="Max API call retries") + parser.add_argument( + "--output-path", + type=str, + required=True, + help="Where to write the generated JSON config", + ) + + args = parser.parse_args() + + generate_config( + model=args.model, + output_path=args.output_path, + api_base_url=args.api_base_url, + api_key_env=args.api_key_env, + temperature=args.temperature, + top_p=args.top_p, + max_completion_tokens=args.max_completion_tokens, + timeout=args.timeout, + max_retries=args.max_retries, + ) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/scripts/run_benchmark.py b/benchmarks/scripts/run_benchmark.py new file mode 100644 index 00000000..7b6b1cf8 --- /dev/null +++ b/benchmarks/scripts/run_benchmark.py @@ -0,0 +1,209 @@ +from __future__ import annotations + +import argparse +import os +import subprocess +import sys +from pathlib import Path + +from benchmarks.scripts.generate_llm_config import generate_config + + +INFER_ENTRYPOINTS = { + "swebench": "swebench-infer", + "gaia": "gaia-infer", + "commit0": "commit0-infer", + "multiswebench": "multiswebench-infer", + "swtbench": "swtbench-infer", + "swebenchmultimodal": "swebenchmultimodal-infer", + "openagentsafety": "openagentsafety-infer", +} + +EVAL_ENTRYPOINTS = { + "swebench": "swebench-eval", + "gaia": "gaia-eval", + "commit0": "commit0-eval", + "multiswebench": "multiswebench-eval", + "swtbench": "swtbench-eval", + "swebenchmultimodal": "swebenchmultimodal-eval", + # openagentsafety doesn't have a separate eval entrypoint +} + +# Patch-based benchmarks use "finish_with_patch" (requires git patch). +# gaia and openagentsafety use "pass" (accept any completed output). +BENCHMARK_CRITIC = { + "swebench": "finish_with_patch", + "swtbench": "finish_with_patch", + "swebenchmultimodal": "finish_with_patch", + "multiswebench": "finish_with_patch", + "commit0": "finish_with_patch", + "gaia": "pass", + "openagentsafety": "pass", +} + + +def _build_infer_cmd(args: argparse.Namespace, llm_config_path: Path) -> list[str]: + """Build the inference command with benchmark-specific args.""" + cmd = [ + INFER_ENTRYPOINTS[args.benchmark], + str(llm_config_path), + "--workspace", args.workspace, + "--max-iterations", str(args.max_iterations), + "--conversation-timeout", str(args.conversation_timeout), + "--num-workers", str(args.num_workers), + "--output-dir", str(args.output_dir), + "--max-attempts", str(args.max_attempts), + "--max-retries", str(args.instance_max_retries), + "--critic", BENCHMARK_CRITIC.get(args.benchmark, "finish_with_patch"), + ] + if args.dataset: + cmd.extend(["--dataset", args.dataset]) + if args.split: + cmd.extend(["--split", args.split]) + + if args.note: + cmd.extend(["--note", args.note]) + if args.n_limit is not None: + cmd.extend(["--n-limit", str(args.n_limit)]) + if args.skip_failed_samples: + cmd.append("--skip-failed-samples") + + # ----- Benchmark-specific inference args ----- + + # GAIA requires --level (e.g. 2023_level1, 2023_all) + if args.benchmark == "gaia" and args.level: + cmd.extend(["--level", args.level]) + + # commit0 requires --repo-split (e.g. lite, all) + if args.benchmark == "commit0" and args.repo_split: + cmd.extend(["--repo-split", args.repo_split]) + + # multiswebench requires --lang (e.g. 
java, python, go, c) + if args.benchmark == "multiswebench" and args.language: + cmd.extend(["--lang", args.language]) + + return cmd + + +def _build_eval_cmd(args: argparse.Namespace, output_jsonl: Path) -> list[str]: + """Build the evaluation command with benchmark-specific args.""" + benchmark = args.benchmark + if benchmark not in EVAL_ENTRYPOINTS: + return [] + + cmd = [EVAL_ENTRYPOINTS[benchmark], str(output_jsonl)] + + if benchmark in ("swebench", "swebenchmultimodal") and args.dataset: + cmd.extend(["--dataset", args.dataset]) + + if benchmark == "swebench": + cmd.extend(["--run-id", "eval"]) + if benchmark in ("swebench", "swebenchmultimodal"): + if args.modal is True: + cmd.append("--modal") + elif args.modal is False: + cmd.append("--no-modal") + + if benchmark == "multiswebench" and args.dataset: + cmd.extend(["--dataset", args.dataset]) + if args.language: + cmd.extend(["--lang", args.language]) + + return cmd + + +def main() -> None: + parser = argparse.ArgumentParser() + + # LLM config generation args + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--api-base-url", type=str, required=True) + parser.add_argument("--api-key-env", type=str, default=None, help="Env var name for API key") + parser.add_argument("--temperature", type=float, default=0.0) + parser.add_argument("--top-p", type=float, default=1.0) + parser.add_argument("--max-completion-tokens", type=int, default=4096) + parser.add_argument("--timeout", type=int, default=600) + parser.add_argument("--max-retries", type=int, default=3) + + # Benchmark selection + parser.add_argument("--benchmark", required=True, choices=INFER_ENTRYPOINTS.keys()) + + # Common inference args + parser.add_argument("--dataset", type=str, default=None) + parser.add_argument("--split", type=str, default=None) + parser.add_argument("--workspace", type=str, default="docker") + parser.add_argument("--max-iterations", type=int, default=100) + parser.add_argument("--conversation-timeout", type=float, default=3600.0) + parser.add_argument("--num-workers", type=int, default=1) + parser.add_argument("--note", type=str, default="") + parser.add_argument("--output-dir", type=str, required=True) + parser.add_argument("--max-attempts", type=int, default=3) + parser.add_argument("--instance-max-retries", type=int, default=3) + parser.add_argument("--n-limit", type=int, default=None) + parser.add_argument("--skip-failed-samples", action="store_true") + + # GAIA + parser.add_argument("--level", type=str, default="2023_all", + help="GAIA level (e.g. 2023_level1, 2023_all)") + # commit0 + parser.add_argument("--repo-split", type=str, default="lite", + help="commit0 repo split (lite, all, or repo name)") + # multiswebench + parser.add_argument("--language", type=str, default=None, + help="multiswebench language (java, python, go, c)") + # swebench/swebenchmultimodal + parser.add_argument( + "--modal", + dest="modal", + action=argparse.BooleanOptionalAction, + default=None, + help=( + "Enable/disable Modal for swebench and swebenchmultimodal evaluation. " + "If omitted, each benchmark uses its default." 
+ ), + ) + + args = parser.parse_args() + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + llm_config_path = output_dir / "llm_config.json" + + # 1) Generate LLM config + generate_config( + model=args.model, + api_base_url=args.api_base_url, + api_key_env=args.api_key_env, + temperature=args.temperature, + top_p=args.top_p, + max_completion_tokens=args.max_completion_tokens, + timeout=args.timeout, + max_retries=args.max_retries, + output_path=str(llm_config_path), + ) + + # 2) Run inference + # multiswebench reads LANGUAGE env var at module level for Docker image naming + if args.benchmark == "multiswebench" and args.language: + os.environ["LANGUAGE"] = args.language + + infer_cmd = _build_infer_cmd(args, llm_config_path) + ret = subprocess.call(infer_cmd) + if ret != 0: + sys.exit(ret) + + # 3) Find output.jsonl and run evaluation + output_files = sorted(output_dir.rglob("output.jsonl")) + if not output_files: + print(f"ERROR: Inference did not produce output.jsonl under {output_dir}", file=sys.stderr) + sys.exit(1) + + output_jsonl = output_files[-1] # Use the latest one + + eval_cmd = _build_eval_cmd(args, output_jsonl) + if eval_cmd: + sys.exit(subprocess.call(eval_cmd)) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/swebench/eval_infer.py b/benchmarks/swebench/eval_infer.py index bfed6217..1939e3e9 100644 --- a/benchmarks/swebench/eval_infer.py +++ b/benchmarks/swebench/eval_infer.py @@ -143,14 +143,34 @@ def run_swebench_evaluation( predictions_dir = predictions_path.parent predictions_filename = predictions_path.name - # Run SWE-Bench evaluation using global python (not UV environment) - # since swebench is installed globally - cmd = [ - "uv", - "run", - "python", - "-m", - "swebench.harness.run_evaluation", + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "swebench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "swebench.harness.run_evaluation", + ] + + cmd.extend([ "--dataset_name", dataset, "--predictions_path", @@ -159,7 +179,7 @@ def run_swebench_evaluation( str(workers), "--run_id", run_id, - ] + ]) # Add parameters cmd.extend(["--split", split]) @@ -316,6 +336,16 @@ def main() -> None: shutil.move(str(report_path), str(dest_report_path)) logger.info(f"Moved report file to: {dest_report_path}") + # Add benchmark field to the report + with open(dest_report_path, "r") as f: + report_data = json.load(f) + if isinstance(args.dataset, str) and "/" in args.dataset: + report_data["benchmark"] = args.dataset.split("/")[-1].lower() + else: + report_data["benchmark"] = str(args.dataset).lower() + with open(dest_report_path, "w") as f: + json.dump(report_data, f, indent=4) + # Update Laminar datapoints with evaluation scores LaminarService.get().update_evaluation_scores( str(input_file), str(dest_report_path) diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index daafe8ad..f39e151d 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -25,13 +25,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config 
import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -124,7 +125,7 @@ def prepare_workspace( f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" ) base_agent_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) wrap_needed = should_wrap_instance_id(instance.id) agent_server_image = base_agent_image @@ -170,14 +171,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -282,7 +282,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -323,7 +325,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -344,12 +349,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -373,6 +373,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -382,6 +383,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/swebenchmultimodal/eval_infer.py b/benchmarks/swebenchmultimodal/eval_infer.py index 86bfd298..fc8efa47 100644 --- a/benchmarks/swebenchmultimodal/eval_infer.py +++ b/benchmarks/swebenchmultimodal/eval_infer.py @@ -147,6 +147,7 @@ def update_report_with_component_scores(report_json_path: 
Path) -> dict[str, flo report = json.load(f) # Add component scores to report + report["benchmark"] = "swebench-multimodal" report["component_scores"] = scores # Write updated report @@ -246,6 +247,7 @@ def run_swebench_multimodal_evaluation( split: str = "dev", workers: str = "12", run_id: str | None = None, + modal: bool = True, ) -> Path | None: """ Run SWE-Bench Multimodal evaluation on the predictions file. @@ -262,68 +264,96 @@ def run_swebench_multimodal_evaluation( """ logger.info(f"Running SWE-Bench Multimodal evaluation on {predictions_file}") - # Get the directory of the predictions file - predictions_path = Path(predictions_file) - predictions_dir = predictions_path.parent - predictions_filename = predictions_path.name - - # Default for run_id if not provided - run_id = run_id or predictions_path.stem - - # Run SWE-Bench Multimodal evaluation using UV environment - # The key difference from regular SWE-Bench is the --modal true flag - cmd = [ - "uv", - "run", - "python", - "-m", - "swebench.harness.run_evaluation", - "--dataset_name", - dataset, - "--split", - split, - "--predictions_path", - predictions_filename, - "--max_workers", - str(workers), - "--modal", - "true", - "--run_id", - run_id, - ] - - logger.info(f"Running command: {' '.join(cmd)}") - logger.info(f"Working directory: {predictions_dir}") - logger.info("SWE-Bench Multimodal evaluation output:") - print("-" * 80) - try: + # Get the directory of the predictions file + predictions_path = Path(predictions_file) + predictions_dir = predictions_path.parent + predictions_filename = predictions_path.name + + # Generate run_id if not provided + if run_id is None: + run_id = f"eval_{predictions_path.stem}" + + # Run SWE-Bench Multimodal evaluation + # The key difference from regular SWE-Bench is the --modal true flag + # Try uv first, fall back to current Python interpreter + try: + uv_check = subprocess.run( + ["uv", "--version"], + capture_output=True, + text=True, + ) + uv_available = uv_check.returncode == 0 + except FileNotFoundError: + uv_available = False + + if uv_available: + cmd = [ + "uv", + "run", + "python", + "-m", + "swebench.harness.run_evaluation", + ] + else: + logger.info("uv not available, using current Python interpreter") + cmd = [ + sys.executable, + "-m", + "swebench.harness.run_evaluation", + ] + + cmd.extend([ + "--dataset_name", + dataset, + "--split", + split, + "--predictions_path", + predictions_filename, + "--max_workers", + str(workers), + "--run_id", + run_id, + ]) + if modal: + cmd.extend(["--modal", "true"]) + + logger.info(f"Running command: {' '.join(cmd)}") + logger.info(f"Working directory: {predictions_dir}") + logger.info("SWE-Bench Multimodal evaluation output:") + print("-" * 80) + + # Stream output directly to console, running from predictions file directory result = subprocess.run(cmd, text=True, cwd=predictions_dir) - except FileNotFoundError as e: + + print("-" * 80) + if result.returncode == 0: + logger.info("SWE-Bench Multimodal evaluation completed successfully") + else: + logger.error( + f"SWE-Bench Multimodal evaluation failed with return code {result.returncode}" + ) + raise subprocess.CalledProcessError(result.returncode, cmd) + + # SWE-Bench multimodal writes its summary to ..json + report_path = predictions_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" + if not report_path.exists(): + raise FileNotFoundError( + f"Expected report file not found: {report_path}. " + "SWE-Bench harness output naming may have changed." 
+ ) + + return report_path + + except FileNotFoundError: logger.error( "SWE-Bench evaluation command not found. " "Make sure SWE-Bench is properly installed." ) - raise e - - print("-" * 80) - if result.returncode == 0: - logger.info("SWE-Bench Multimodal evaluation completed successfully") - else: - logger.error( - f"SWE-Bench Multimodal evaluation failed with return code {result.returncode}" - ) - raise subprocess.CalledProcessError(result.returncode, cmd) - - # SWE-Bench multimodal writes its summary to ..json - report_path = predictions_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" - if not report_path.exists(): - raise FileNotFoundError( - f"Expected report file not found: {report_path}. " - "SWE-Bench harness output naming may have changed." - ) - logger.info(f"Found report.json at: {report_path}") - return report_path + raise + except Exception as e: + logger.error(f"Error running SWE-Bench Multimodal evaluation: {e}") + raise def main() -> None: @@ -368,6 +398,13 @@ def main() -> None: help="Number of workers to use when evaluating", ) + parser.add_argument( + "--modal", + action=argparse.BooleanOptionalAction, + default=True, + help="Use Modal for evaluation (default: True). Use --no-modal for local evaluation.", + ) + parser.set_defaults(**EVAL_DEFAULTS) parser.add_argument( @@ -405,7 +442,7 @@ def main() -> None: if not args.skip_evaluation: # Run multimodal evaluation report_path = run_swebench_multimodal_evaluation( - str(output_file), args.dataset, args.split, args.workers, args.run_id + str(output_file), args.dataset, args.split, args.workers, args.run_id, args.modal ) # Calculate component scores if we have a report @@ -414,6 +451,15 @@ def main() -> None: "Calculating component scores (solveable/unsolveable accuracy)..." ) component_scores = update_report_with_component_scores(report_path) + # Export a .report.json artifact so framework parsers + # can discover benchmark results consistently across benchmarks. 
+ with open(report_path, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = "swebench-multimodal" + dest_report_path = input_file.with_suffix(".report.json") + with open(dest_report_path, "w") as f: + json.dump(report_data, f, indent=4) + logger.info(f"Wrote report artifact to: {dest_report_path}") if component_scores: logger.info("=" * 60) logger.info("COMPONENT SCORES SUMMARY") diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 02101697..ee968cfe 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -23,13 +23,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import ( LLM, Agent, @@ -160,7 +161,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") logger.info(f"SKIP_BUILD={SKIP_BUILD}") @@ -196,14 +197,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -212,7 +212,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -369,7 +369,9 @@ def evaluate_instance( logger.info("No image_assets found, sending text-only instruction") conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -411,7 +413,10 @@ def evaluate_instance( def main() -> None: prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -433,12 +438,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not 
os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -462,6 +462,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -471,6 +472,7 @@ def main() -> None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/swtbench/eval_infer.py b/benchmarks/swtbench/eval_infer.py index c245aa42..fb8ebedc 100644 --- a/benchmarks/swtbench/eval_infer.py +++ b/benchmarks/swtbench/eval_infer.py @@ -148,7 +148,9 @@ def update_report_with_submitted_instances( ) -def convert_to_swtbench_format(input_file: str, output_file: str) -> None: +def convert_to_swtbench_format( + input_file: str, output_file: str +) -> None: """ Convert OpenHands output.jsonl to SWT-Bench prediction format. @@ -271,21 +273,32 @@ def run_swtbench_evaluation( # but using the uv environment's python executable which has all dependencies benchmarks_dir = Path(__file__).parent.parent.parent - # Get the python executable from the uv environment - python_executable = subprocess.run( - [ - "uv", - "run", - "--directory", - str(benchmarks_dir), - "python", - "-c", - "import sys; print(sys.executable)", - ], - capture_output=True, - text=True, - cwd=benchmarks_dir, - ).stdout.strip() + # Get the python executable from the uv environment, fall back to current interpreter + try: + uv_result = subprocess.run( + [ + "uv", + "run", + "--directory", + str(benchmarks_dir), + "python", + "-c", + "import sys; print(sys.executable)", + ], + capture_output=True, + text=True, + cwd=benchmarks_dir, + ) + uv_available = uv_result.returncode == 0 and uv_result.stdout.strip() + except FileNotFoundError: + uv_available = False + uv_result = None + + if uv_available: + python_executable = uv_result.stdout.strip() + else: + python_executable = sys.executable + logger.info("uv not available, using current Python interpreter") # Set up environment with PYTHONPATH to include swt-bench directory env = os.environ.copy() @@ -301,7 +314,7 @@ def run_swtbench_evaluation( "--max_workers", str(workers), "--run_id", - predictions_path.stem, + f"eval_{predictions_path.stem}", ] logger.info(f"Using Python executable: {python_executable}") @@ -436,7 +449,7 @@ def main() -> None: cache_dir = Path.home() / ".cache" / "openhands" / "swt-bench" swt_bench_dir = cache_dir / "swt-bench" report_dir = swt_bench_dir / "evaluation_results" - run_id = output_file.stem + run_id = f"eval_{output_file.stem}" report_file = report_dir / f"{MODEL_NAME_OR_PATH}.{run_id}.json" target_dir = input_file.parent @@ -444,6 +457,14 @@ def main() -> None: shutil.move(str(report_file), str(target_file)) logger.info(f"Moved evaluation report to: {target_file}") dest_report_path = target_file + + # Add benchmark field to the report + with open(target_file, "r") as f: + report_data = json.load(f) + report_data["benchmark"] = "swtbench" + with open(target_file, "w") as f: + json.dump(report_data, f, indent=4) + update_report_with_submitted_instances(target_file, output_file) # Update Laminar 
datapoints with evaluation scores diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index af2724bb..e85b8a13 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -17,14 +17,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import SDK_SHORT_SHA -from openhands.agent_server.docker.build import _base_slug +from benchmarks.utils.version import IMAGE_TAG_PREFIX from openhands.sdk import LLM, Agent, Conversation, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -53,6 +53,10 @@ def get_agent_server_docker_image( target: str = "source-minimal", ) -> str: """Get the agent server Docker image for an instance.""" + # Importing here because openhands.agent_server.docker.build runs git checks + # which fails when installed as a package outside the git repo + from openhands.agent_server.docker.build import _base_slug + official_image_name = get_official_docker_image(instance_id, docker_image_prefix) return ( "ghcr.io/all-hands-ai/agent-server" @@ -166,7 +170,7 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") logger.info(f"SKIP_BUILD={SKIP_BUILD}") @@ -180,12 +184,25 @@ def prepare_workspace( "agent-server image." 
) # For SWT-bench, we use DockerDevWorkspace with base_image - workspace = DockerDevWorkspace( - base_image=official_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], - ) + # Fall back to pre-built image if build fails + try: + workspace = DockerDevWorkspace( + base_image=official_docker_image, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + except Exception as build_error: + if not image_exists(agent_server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {agent_server_image} does not exist" + ) + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) + logger.info(f"Using pre-built image {agent_server_image}") else: workspace = DockerWorkspace( server_image=agent_server_image, @@ -194,14 +211,13 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") - sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" ) if not image_exists(agent_server_image): raise RuntimeError( @@ -210,7 +226,7 @@ def prepare_workspace( ) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" + f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -303,7 +319,9 @@ def evaluate_instance( ) conversation.send_message(instruction) # Run conversation with fake user responses to handle agent messages - run_conversation_with_fake_user_response(conversation) + run_conversation_with_fake_user_response( + conversation, run_timeout=self.metadata.conversation_timeout + ) # git add workspace.execute_command(f"cd {repo_path} ; git add -A") @@ -344,7 +362,10 @@ def evaluate_instance( def main() -> None: """Main entry point for SWT-bench evaluation.""" prompt_dir = (Path(__file__).parent / "prompts").resolve() - choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + try: + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + except ValueError: + choices = [str(p) for p in prompt_dir.glob("*.j2")] default_prompt_path = prompt_dir / "default.j2" assert default_prompt_path.exists(), ( f"Default prompt {default_prompt_path} not found" @@ -365,12 +386,7 @@ def main() -> None: if args.max_attempts < 1: raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}") - llm_config_path = args.llm_config_path - if not os.path.isfile(llm_config_path): - raise ValueError(f"LLM config file {llm_config_path} does not exist") - with open(llm_config_path, "r") as f: - llm_config = f.read() - llm = LLM.model_validate_json(llm_config) + llm = load_llm_config(args.llm_config_path) logger.info("Using LLM config: %s", llm.model_dump_json(indent=2)) dataset_description = ( @@ -392,6 +408,7 @@ def main() -> None: dataset=args.dataset, dataset_split=args.split, max_iterations=args.max_iterations, + conversation_timeout=args.conversation_timeout, eval_output_dir=structured_output_dir, details={}, prompt_path=args.prompt_path, @@ -401,6 +418,7 @@ def main() -> 
None: critic=critic, selected_instances_file=args.select, max_retries=args.max_retries, + skip_failed_samples=args.skip_failed_samples, workspace_type=args.workspace, ) diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 6ae98855..f0818064 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -49,6 +49,14 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: default=500, help="Maximum iterations (default: 500)", ) + parser.add_argument( + "--conversation-timeout", + type=float, + default=3600.0, + help=( + "Timeout (seconds) for a single Conversation.run() call on remote workspaces " + ), + ) parser.add_argument("--num-workers", type=int, help="Number of inference workers") parser.add_argument("--note", type=str, help="Optional evaluation note") parser.add_argument( @@ -84,4 +92,9 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: default=3, help="Maximum retries for instances that throw exceptions (default: 3)", ) + parser.add_argument( + "--skip-failed-samples", + action="store_true", + help="Skip failed samples and treat as not solved", + ) return parser diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 9c700f1d..1653f84a 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -28,7 +28,6 @@ ) from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.image_utils import image_exists -from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -279,9 +278,13 @@ def build_image( base_image: str, target_image: str, custom_tag: str, - target: TargetType = "source-minimal", + target: str = "source-minimal", push: bool = False, ) -> BuildOutput: + # Importing here because openhands.agent_server.docker.build runs git checks + # which fails when installed as a package outside the git repo + from openhands.agent_server.docker.build import BuildOptions, build + # Get SDK info from submodule to ensure tags use the correct SDK SHA git_ref, git_sha, sdk_version = _get_sdk_submodule_info() @@ -312,7 +315,7 @@ def _build_with_logging( base_image: str, target_image: str, custom_tag: str = "", - target: TargetType = "source-minimal", + target: str = "source-minimal", push: bool = False, max_retries: int = 3, post_build_fn: Callable[[BuildOutput, bool], BuildOutput] | None = None, @@ -405,7 +408,7 @@ def default_build_output_dir( def build_all_images( base_images: list[str], - target: TargetType, + target: str, build_dir: Path, image: str = EVAL_AGENT_SERVER_IMAGE, push: bool = False, diff --git a/benchmarks/utils/constants.py b/benchmarks/utils/constants.py index 882f1b63..ddd96006 100644 --- a/benchmarks/utils/constants.py +++ b/benchmarks/utils/constants.py @@ -1,5 +1,11 @@ +import os + OUTPUT_FILENAME = "output.jsonl" -EVAL_AGENT_SERVER_IMAGE = "ghcr.io/openhands/eval-agent-server" + +# Image name for agent server (can be overridden via env var) +EVAL_AGENT_SERVER_IMAGE = os.getenv( + "OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server" +) # Model identifier used in swebench-style prediction entries. 
# The swebench harness uses this value to create log directory structures diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 32177e38..1548b624 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -6,6 +6,7 @@ import json import os import sys +import traceback import time from abc import ABC, abstractmethod from concurrent.futures import FIRST_COMPLETED, Future, ProcessPoolExecutor, wait @@ -55,6 +56,15 @@ class PendingInstance: OnResult = Callable[[EvalInstance, EvalOutput], None] +class SampleFailedError(Exception): + """Raised when a sample fails and skip_failed_samples=False.""" + + def __init__(self, instance_id: str, error: str): + self.instance_id = instance_id + self.error = error + super().__init__(f"Sample {instance_id} failed: {error}") + + class Evaluation(ABC, BaseModel): """Abstract orchestrator for instance processing (process-based).""" @@ -132,12 +142,23 @@ def evaluate_instance( raise NotImplementedError def _create_error_output( - self, instance: EvalInstance, error: Exception, retry_count: int + self, + instance: EvalInstance, + error: Exception, + retry_count: int, + *, + stack: str | None = None, ) -> EvalOutput: """Create an EvalOutput object for a failed instance.""" + err_type = error.__class__.__name__ + err_msg = str(error) return EvalOutput( instance_id=instance.id, - test_result={}, + test_result={ + "error_type": err_type, + "error_message": err_msg, + "error_stack": stack, + }, instruction=None, error=( f"Instance failed after {retry_count} retries. Last error: {str(error)}" @@ -411,6 +432,10 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: instance, out = fut.result() pending_info = pending_instances.get(fut) + # Fail fast if skip_failed_samples=False and sample errored + if out.error and not self.metadata.skip_failed_samples: + raise SampleFailedError(instance.id, out.error) + # Add Laminar metadata to EvalOutput if out.metadata is None: out.metadata = self.metadata.model_copy(deep=True) @@ -422,6 +447,9 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: ) attempt_on_result(instance, out) + except SampleFailedError: + # Re-raise to fail the entire evaluation + raise except Exception as e: logger.error( f"Unexpected error from worker process: {str(e)[:50]}", @@ -455,6 +483,7 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: f"{self.instance_timeout}s timeout" ), attempt, + stack=None, ) if error_output.metadata is None: error_output.metadata = self.metadata.model_copy( @@ -581,6 +610,7 @@ def _process_one_mp( retry_count = 0 runtime_failure_count = 0 last_error = None + last_error_stack: str | None = None max_retries = self.metadata.max_retries runtime_runs: list[RemoteRuntimeAllocation] = [] @@ -648,6 +678,7 @@ def _process_one_mp( return instance, out except Exception as e: last_error = e + last_error_stack = traceback.format_exc() retry_count += 1 lmnr_span.record_exception(e) @@ -692,7 +723,10 @@ def _process_one_mp( ) # Create error output for final failure error_output = self._create_error_output( - instance, last_error, max_retries + instance, + last_error, + max_retries, + stack=last_error_stack, ) if runtime_runs: error_output.runtime_runs = runtime_runs @@ -723,7 +757,10 @@ def _process_one_mp( # This should never be reached, but added for type safety error_output = self._create_error_output( - instance, Exception("Unexpected error: no attempts made"), max_retries + instance, + Exception("Unexpected error: no 
attempts made"), + max_retries, + stack=None, ) if runtime_runs: error_output.runtime_runs = runtime_runs diff --git a/benchmarks/utils/fake_user_response.py b/benchmarks/utils/fake_user_response.py index 8b2848fa..befcfa75 100644 --- a/benchmarks/utils/fake_user_response.py +++ b/benchmarks/utils/fake_user_response.py @@ -119,6 +119,7 @@ def run_conversation_with_fake_user_response( conversation: "BaseConversation", fake_user_response_fn: FakeUserResponseFn = fake_user_response, max_fake_responses: int = 10, + run_timeout: float | None = None, ) -> None: """Run a conversation with automatic fake user responses. @@ -137,13 +138,20 @@ def run_conversation_with_fake_user_response( Defaults to fake_user_response. max_fake_responses: Maximum number of fake responses to send before stopping. This prevents infinite loops. + run_timeout: Optional timeout in seconds for conversation.run() calls """ fake_response_count = 0 + # Only RemoteConversation.run() supports a timeout kwarg. + from openhands.sdk.conversation.impl.remote_conversation import RemoteConversation + while True: # Run the conversation - conversation.run() + if run_timeout is not None and isinstance(conversation, RemoteConversation): + conversation.run(timeout=run_timeout) + else: + conversation.run() # Check the execution status status = conversation.state.execution_status diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index a463f3b4..b328a54f 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 import base64 +import subprocess import sys import requests @@ -54,12 +55,31 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: return None +def _local_image_exists(image_ref: str) -> bool: + """Check if image exists in local Docker daemon.""" + try: + result = subprocess.run( + ["docker", "images", "-q", image_ref], + capture_output=True, + text=True, + timeout=10, + ) + return bool(result.stdout.strip()) + except (subprocess.SubprocessError, FileNotFoundError): + return False + + def image_exists( image_ref: str, gh_username: str | None = None, gh_pat: str | None = None, # GitHub PAT with read:packages for private GHCR docker_token: str | None = None, # Docker Hub JWT if you already have one ) -> bool: + # Check local Docker first + if _local_image_exists(image_ref): + return True + + # Then check remote registry registry, repo, ref = _parse(image_ref) headers = {"Accept": ACCEPT} diff --git a/benchmarks/utils/llm_config.py b/benchmarks/utils/llm_config.py new file mode 100644 index 00000000..952704cd --- /dev/null +++ b/benchmarks/utils/llm_config.py @@ -0,0 +1,37 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path + +from openhands.sdk import LLM + + +def load_llm_config(config_path: str | Path) -> LLM: + config_path = Path(config_path) + if not config_path.is_file(): + raise ValueError(f"LLM config file {config_path} does not exist") + + with config_path.open("r") as f: + llm_config = json.load(f) + + # load api_key from env var if api_key_env is specified + if "api_key_env" in llm_config: + env_var = llm_config.pop("api_key_env") + api_key = os.environ.get(env_var, "") + if not api_key: + raise ValueError( + f"Environment variable {env_var} is not set or empty. " + f"Please set it with your API key." 
+ ) + llm_config["api_key"] = api_key + + # strip /chat/completions from base_url for LiteLLM compatibility + if "base_url" in llm_config: + base_url = llm_config["base_url"] + base_url = base_url.rstrip("/") + if base_url.endswith("/chat/completions"): + base_url = base_url.removesuffix("/chat/completions") + llm_config["base_url"] = base_url + + return LLM.model_validate(llm_config) diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index f04b405d..4c4738b4 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -18,6 +18,13 @@ class EvalMetadata(BaseModel): dataset: str dataset_split: str = Field(default="test") max_iterations: int + conversation_timeout: float = Field( + default=3600.0, + ge=0, + description=( + "Timeout in seconds for a single Conversation.run() call (remote workspaces). " + ), + ) eval_output_dir: str details: dict[str, Any] | None = None prompt_path: str | None = Field( @@ -48,6 +55,13 @@ class EvalMetadata(BaseModel): ge=0, description="Maximum number of retries for instances that throw exceptions", ) + skip_failed_samples: bool = Field( + default=True, + description=( + "If True, failed samples are skipped and treated as not solved. " + "If False, the entire evaluation fails on the first failed sample." + ), + ) workspace_type: Literal["docker", "remote"] = Field( default="docker", description="Type of workspace to use, e.g., 'docker' or 'remote'", diff --git a/benchmarks/utils/version.py b/benchmarks/utils/version.py index 951c6592..15b2da3f 100644 --- a/benchmarks/utils/version.py +++ b/benchmarks/utils/version.py @@ -1,4 +1,6 @@ +import os import subprocess +import warnings from pathlib import Path @@ -18,10 +20,19 @@ def _get_submodule_sha(submodule_path: Path) -> str: def get_sdk_sha() -> str: """ - Get the current git sha from the SDK submodule. + Get the SDK SHA from git submodule, falling back to "unknown". """ - return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk") + try: + return _get_submodule_sha(PROJECT_ROOT / "vendor" / "software-agent-sdk") + except subprocess.CalledProcessError: + warnings.warn( + "Could not get SDK SHA from git submodule. Using 'unknown' as fallback. " + ) + return "unknown" SDK_SHA = get_sdk_sha() SDK_SHORT_SHA = SDK_SHA[:7] + +# This is used as the first part of the image tag: -- +IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", SDK_SHORT_SHA) diff --git a/nemo_evaluator/openhands_benchmarks/__init__.py b/nemo_evaluator/openhands_benchmarks/__init__.py new file mode 100644 index 00000000..b28b04f6 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/__init__.py @@ -0,0 +1,3 @@ + + + diff --git a/nemo_evaluator/openhands_benchmarks/framework.yml b/nemo_evaluator/openhands_benchmarks/framework.yml new file mode 100644 index 00000000..c14aa91d --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/framework.yml @@ -0,0 +1,266 @@ +framework: + name: openhands_benchmarks + pkg_name: openhands_benchmarks + full_name: OpenHands Benchmarks + description: Multi-benchmark evaluation harness using the OpenHands agent framework. 
+ url: https://github.com/All-Hands-AI/openhands-agent-benchmarks + +defaults: + command: >- + python3 -m benchmarks.scripts.run_benchmark + --model openai/{{target.api_endpoint.model_id}} + --api-base-url {{target.api_endpoint.url}} + {% if target.api_endpoint.api_key_name is not none %}--api-key-env {{target.api_endpoint.api_key_name}}{% endif %} + --temperature {{config.params.temperature}} + --top-p {{config.params.top_p}} + --max-completion-tokens {{config.params.max_new_tokens}} + --timeout {{config.params.request_timeout}} + --max-retries {{config.params.max_retries}} + --benchmark {{config.params.extra.benchmark}} + {% if config.params.extra.dataset is defined and config.params.extra.dataset is not none %}--dataset {{config.params.extra.dataset}}{% endif %} + {% if config.params.extra.split is defined and config.params.extra.split is not none %}--split {{config.params.extra.split}}{% endif %} + --workspace {{config.params.extra.workspace}} + --max-iterations {{config.params.extra.max_steps}} + --conversation-timeout {{config.params.extra.conversation_timeout}} + --num-workers {{config.params.parallelism}} + --note {{config.type}} + --output-dir {{config.output_dir}} + --max-attempts {{config.params.extra.max_attempts}} + --instance-max-retries {{config.params.extra.instance_max_retries}} + {% if config.params.limit_samples is not none %}--n-limit {{config.params.limit_samples}}{% endif %} + {% if config.params.extra.skip_failed_samples %}--skip-failed-samples{% endif %} + {% if config.params.extra.level is defined and config.params.extra.level is not none %}--level {{config.params.extra.level}}{% endif %} + {% if config.params.extra.repo_split is defined and config.params.extra.repo_split is not none %}--repo-split {{config.params.extra.repo_split}}{% endif %} + {% if config.params.extra.language is defined and config.params.extra.language is not none %}--language {{config.params.extra.language}}{% endif %} + {% if config.params.extra.modal is defined and config.params.extra.modal is not none %}{% if config.params.extra.modal %}--modal{% else %}--no-modal{% endif %}{% endif %} + + config: + params: + limit_samples: null + temperature: 0.6 + top_p: 1.0 + max_new_tokens: 64000 + request_timeout: 84000 + max_retries: 5 + parallelism: 1 + extra: + workspace: docker + max_steps: 100 + conversation_timeout: 28000 + max_attempts: 3 + instance_max_retries: 3 + skip_failed_samples: false + target: + api_endpoint: + adapter_config: + mode: client # disable adapters by default + +evaluations: + # SWE-bench variants + - name: swebench-verified + description: SWE-bench Verified - 500 human-validated GitHub issues + defaults: + config: + type: swebench-verified + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Verified + split: test + + - name: swebench-lite + description: SWE-bench Lite - 300 curated GitHub issues + defaults: + config: + type: swebench-lite + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench_Lite + split: test + + - name: swebench-full + description: SWE-bench Full - Complete dataset of GitHub issues + defaults: + config: + type: swebench-full + supported_endpoint_types: [chat] + params: + extra: + benchmark: swebench + dataset: princeton-nlp/SWE-bench + split: test + + # GAIA benchmark + - name: gaia + description: GAIA - General AI Assistant benchmark for real-world tasks requiring reasoning, tool use, and web browsing + defaults: + config: + type: gaia + 
supported_endpoint_types: [chat] + params: + extra: + benchmark: gaia + dataset: gaia-benchmark/GAIA + split: test + level: "2023_all" + + # Commit0 benchmark + - name: commit0 + description: Commit0 - Repository-level code generation benchmark + defaults: + config: + type: commit0 + supported_endpoint_types: [chat] + params: + extra: + benchmark: commit0 + dataset: wentingzhao/commit0_combined + split: test + repo_split: lite + max_attempts: 1 + + # Multi-SWE-bench (multilingual) + - name: multiswebench-java + description: Multi-SWE-bench Java - Multilingual SWE-bench for Java repositories + defaults: + config: + type: multiswebench-java + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: java_verified + language: java + + - name: multiswebench-python # empty subset + description: Multi-SWE-bench Python - Multilingual SWE-bench for Python repositories + defaults: + config: + type: multiswebench-python + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: python_verified + language: python + + - name: multiswebench-go + description: Multi-SWE-bench Go - Multilingual SWE-bench for Go repositories + defaults: + config: + type: multiswebench-go + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: go_verified + language: go + + - name: multiswebench-c + description: Multi-SWE-bench C - Multilingual SWE-bench for C repositories + defaults: + config: + type: multiswebench-c + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: c_verified + language: c + + - name: multiswebench-cpp + description: Multi-SWE-bench C++ - Multilingual SWE-bench for C++ repositories + defaults: + config: + type: multiswebench-cpp + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: cpp_verified + language: cpp + + - name: multiswebench-js + description: Multi-SWE-bench JavaScript - Multilingual SWE-bench for JavaScript repositories + defaults: + config: + type: multiswebench-js + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: js_verified + language: js + + - name: multiswebench-rust + description: Multi-SWE-bench Rust - Multilingual SWE-bench for Rust repositories + defaults: + config: + type: multiswebench-rust + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: rust_verified + language: rust + + - name: multiswebench-ts + description: Multi-SWE-bench TypeScript - Multilingual SWE-bench for TypeScript repositories + defaults: + config: + type: multiswebench-ts + supported_endpoint_types: [chat] + params: + extra: + benchmark: multiswebench + dataset: bytedance-research/Multi-SWE-Bench + split: ts_verified + language: ts + + # SWT-bench + - name: swtbench + description: SWT-bench - Software testing benchmark for test generation + defaults: + config: + type: swtbench + supported_endpoint_types: [chat] + params: + extra: + benchmark: swtbench + + # SWE-bench Multimodal + - name: swebench-multimodal + description: SWE-bench Multimodal - GitHub issues with visual context + defaults: + config: + type: swebench-multimodal + supported_endpoint_types: 
[chat] + params: + extra: + benchmark: swebenchmultimodal + dataset: princeton-nlp/SWE-bench_Multimodal + split: dev # test spit did not work + modal: false + + # OpenAgentSafety benchmark + - name: openagentsafety + description: OpenAgentSafety - Safety evaluation benchmark for AI agents + defaults: + config: + type: openagentsafety + supported_endpoint_types: [chat] + params: + extra: + benchmark: openagentsafety + dataset: mgulavani/openagentsafety_full_updated_v3 + split: train diff --git a/nemo_evaluator/openhands_benchmarks/output.py b/nemo_evaluator/openhands_benchmarks/output.py new file mode 100644 index 00000000..9104c680 --- /dev/null +++ b/nemo_evaluator/openhands_benchmarks/output.py @@ -0,0 +1,54 @@ +import json +import pathlib + +from nemo_evaluator.api.api_dataclasses import EvaluationResult + + +def parse_output(output_dir: str) -> EvaluationResult: + output_path = pathlib.Path(output_dir) + + # Find any .report.json file (all benchmarks use this naming convention) + report_files = sorted(output_path.rglob("*.report.json")) + + if not report_files: + raise FileNotFoundError( + f"No .report.json file found under {output_dir}. " + "Make sure the evaluation completed successfully." + ) + + if len(report_files) > 1: + raise ValueError( + f"Multiple .report.json files found: {report_files}. " + "`output_dir` must contain a single evaluation run." + ) + + report = json.loads(report_files[0].read_text(encoding="utf-8")) + + # Get benchmark name from report + task_name = report["benchmark"] + + # All benchmarks have these common fields in their report + resolved = report.get("resolved_instances", 0) + submitted = report.get("submitted_instances", 0) + + # Calculate accuracy (handle division by zero) + accuracy = resolved / submitted if submitted > 0 else 0.0 + + metrics = { + "accuracy": { + "scores": { + "accuracy": { + "value": accuracy, + "stats": { + "resolved": resolved, + "total": submitted, + }, + } + } + } + } + + tasks = {task_name: {"metrics": metrics}} + groups = {task_name: {"metrics": metrics}} + + return EvaluationResult(tasks=tasks, groups=groups) diff --git a/pyproject.toml b/pyproject.toml index 843655a6..83bdcf79 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,10 +42,13 @@ dependencies = [ "lmnr>=0.7.24", "multi-swe-bench>=1.1.1", "swt-bench @ git+https://github.com/logic-star-ai/swt-bench.git@5fdcd446ff05e248ecfffc19d560a210699f71f8", -] + "nemo_evaluator", + ] [project.scripts] validate-cfg = "benchmarks.scripts.validate_cfg:main" +generate-llm-config = "benchmarks.scripts.generate_llm_config:main" +run-benchmark = "benchmarks.scripts.run_benchmark:main" swebench-infer = "benchmarks.swebench.run_infer:main" swtbench-infer = "benchmarks.swtbench.run_infer:main" swebench-eval = "benchmarks.swebench.eval_infer:main" @@ -68,12 +71,16 @@ build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] where = ["."] -include = ["benchmarks"] +include = ["benchmarks", "benchmarks*", "nemo_evaluator", "nemo_evaluator*"] [tool.setuptools] # Install the top-level sitecustomize module so Python auto-loads our Modal logging patch. 
py-modules = ["sitecustomize"] +[tool.setuptools.package-data] +nemo_evaluator = ["**/*.yml"] +benchmarks = ["**/*.j2", "**/Dockerfile*", "**/*.json"] + [dependency-groups] dev = [ "pre-commit>=4.3.0", @@ -90,7 +97,6 @@ dev = [ [tool.ruff] target-version = "py312" line-length = 88 -exclude = ["legacy"] [tool.ruff.format] quote-style = "double" From c5f40518280f60c74761474517ddac5833390939 Mon Sep 17 00:00:00 2001 From: Ewa Dobrowolska Date: Thu, 12 Feb 2026 22:04:50 +0100 Subject: [PATCH 2/2] remove redundant VERSION file --- VERSION | 1 - 1 file changed, 1 deletion(-) delete mode 100644 VERSION diff --git a/VERSION b/VERSION deleted file mode 100644 index 6e8bf73a..00000000 --- a/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.1.0
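
The sketches below are illustrative only and are not part of the patch. The new --skip-failed-samples flag (args_parser.py, models.py, evaluation.py) decides whether a failed instance aborts the whole run via SampleFailedError or is kept as submitted-but-unresolved. A simplified, stand-alone illustration of that decision follows; the real check lives in Evaluation.run() in benchmarks/utils/evaluation.py and operates on EvalOutput objects.

    # Simplified stand-in for the fail-fast behaviour; not the Evaluation class itself.
    class SampleFailedError(Exception):
        def __init__(self, instance_id: str, error: str):
            super().__init__(f"Sample {instance_id} failed: {error}")


    def handle_result(instance_id: str, error: str | None, skip_failed_samples: bool) -> bool:
        """Return True if the sample succeeded, False if it is skipped as unresolved."""
        if error is None:
            return True
        if not skip_failed_samples:
            # Without --skip-failed-samples, the first failed sample aborts the run
            raise SampleFailedError(instance_id, error)
        # With --skip-failed-samples, the sample stays submitted but counts as not solved
        return False


    print(handle_result("astropy__astropy-1234", None, skip_failed_samples=True))   # True
    print(handle_result("astropy__astropy-5678", "build failed", skip_failed_samples=True))  # False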
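
The run_timeout added to run_conversation_with_fake_user_response() is only forwarded when the conversation is a RemoteConversation, since only the remote implementation's run() accepts a timeout. A rough sketch of that gating is below; the two classes are mock stand-ins for illustration, not the SDK's conversation types.

    # Mock stand-ins for the SDK conversation classes, used only to show the gating.
    class LocalConversation:
        def run(self):
            print("run() without timeout")


    class RemoteConversation(LocalConversation):
        def run(self, timeout: float | None = None):
            print(f"run(timeout={timeout})")


    def run_once(conversation, run_timeout: float | None) -> None:
        # Mirrors the check in benchmarks/utils/fake_user_response.py:
        # pass the timeout only to remote conversations.
        if run_timeout is not None and isinstance(conversation, RemoteConversation):
            conversation.run(timeout=run_timeout)
        else:
            conversation.run()


    run_once(LocalConversation(), run_timeout=3600.0)   # falls back to a bare run()
    run_once(RemoteConversation(), run_timeout=3600.0)  # timeout is passed through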
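
The new benchmarks/utils/llm_config.py centralizes LLM config loading: it resolves an optional api_key_env indirection from the environment and strips a trailing /chat/completions from base_url before validating the dict as an SDK LLM. A minimal sketch of the same normalization on a plain dict follows; the "model" field and the example values are assumptions for illustration, not taken from the patch.

    import json
    import os

    # Hypothetical config; "model" is an assumed field name on the SDK's LLM model,
    # while the api_key_env/base_url handling mirrors load_llm_config().
    cfg = {
        "model": "openai/my-model",
        "api_key_env": "MY_PROVIDER_API_KEY",
        "base_url": "https://inference.example.com/v1/chat/completions",
    }

    os.environ.setdefault("MY_PROVIDER_API_KEY", "dummy-key")  # normally set by the harness

    # api_key_env -> api_key, read from the environment
    cfg["api_key"] = os.environ[cfg.pop("api_key_env")]

    # Strip a trailing /chat/completions so LiteLLM can append its own route
    base_url = cfg["base_url"].rstrip("/")
    if base_url.endswith("/chat/completions"):
        base_url = base_url.removesuffix("/chat/completions")
    cfg["base_url"] = base_url

    print(json.dumps(cfg, indent=2))
    # With the benchmarks package installed, load_llm_config() performs these steps
    # and then returns LLM.model_validate(cfg).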
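
The agent-server image reference used in prepare_workspace() is now composed from two overridable pieces: OPENHANDS_EVAL_AGENT_SERVER_IMAGE (constants.py) and IMAGE_TAG_PREFIX (version.py, defaulting to the SDK short SHA). A rough sketch of the resulting tag, using a hypothetical custom_tag and build target:

    import os

    # Defaults mirror benchmarks/utils/constants.py and benchmarks/utils/version.py;
    # both environment variables are optional overrides introduced by this patch.
    EVAL_AGENT_SERVER_IMAGE = os.getenv(
        "OPENHANDS_EVAL_AGENT_SERVER_IMAGE", "ghcr.io/openhands/eval-agent-server"
    )
    IMAGE_TAG_PREFIX = os.getenv("IMAGE_TAG_PREFIX", "unknown")  # repo default is the SDK short SHA

    # Hypothetical per-instance values, for illustration only
    custom_tag = "sympy-sympy-12345"
    build_target = "source-minimal"
    suffix = f"-{build_target}" if build_target != "binary" else ""

    agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
    print(agent_server_image)
    # -> ghcr.io/openhands/eval-agent-server:unknown-sympy-sympy-12345-source-minimal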
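
nemo_evaluator/openhands_benchmarks/output.py expects exactly one *.report.json under the output directory and reads the benchmark, resolved_instances, and submitted_instances keys that the patched eval_infer scripts now write. A self-contained sketch of that contract, with made-up counts and without the nemo-evaluator dependency:

    import json
    import pathlib
    import tempfile

    # Made-up counts; the key names come from the patched eval_infer reports.
    report = {
        "benchmark": "swebench",
        "submitted_instances": 300,
        "resolved_instances": 123,
    }

    out_dir = pathlib.Path(tempfile.mkdtemp())
    (out_dir / "output.report.json").write_text(json.dumps(report, indent=4))

    # Mirror parse_output(): find the single *.report.json and derive accuracy
    report_files = sorted(out_dir.rglob("*.report.json"))
    assert len(report_files) == 1, "output_dir must contain exactly one report"
    data = json.loads(report_files[0].read_text(encoding="utf-8"))
    resolved = data.get("resolved_instances", 0)
    submitted = data.get("submitted_instances", 0)
    accuracy = resolved / submitted if submitted > 0 else 0.0
    print(f"{data['benchmark']}: accuracy={accuracy:.3f} ({resolved}/{submitted})")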