From 441328aa8df91984387a3564e8ae1ed7550ac22d Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Tue, 17 Feb 2026 17:46:10 +0000
Subject: [PATCH] Add git reset validation script and fix missing resets

This PR adds a validation script that ensures git reset is called after
every git clone or git checkout operation in the benchmark codebase.
This prevents agents from looking at commits that are not part of the
benchmark during evaluation.

Changes:
- Add validate_git_reset.py script to check for git reset after clone/checkout
- Add pre-commit hook for automatic validation
- Fix 6 files with missing git reset operations:
  - benchmarks/commit0/run_infer.py
  - legacy/ml_bench/run_infer.py
  - legacy/lca_ci_build_repair/run_infer.py
  - legacy/lca_ci_build_repair/eval_infer.py
  - legacy/swefficiency/scripts/setup/prepare_swe_utils.sh
  - legacy/testgeneval/scripts/setup/prepare_swe_utils.sh
- Add comments for version_control.sh where git reset is not needed
- Add test suite for the validation script (14 tests)
- Add CLI entry point: validate-git-reset

The validation accepts 'git reset' in comments as valid, allowing
explicit documentation when reset is intentionally skipped.

Fixes #424

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .pre-commit-config.yaml                       |   8 +
 benchmarks/commit0/run_infer.py               |   6 +
 benchmarks/scripts/validate_git_reset.py      | 178 ++++++++++++++++++
 legacy/lca_ci_build_repair/eval_infer.py      |   5 +
 legacy/lca_ci_build_repair/run_infer.py       |   5 +
 legacy/ml_bench/run_infer.py                  |   5 +
 .../scripts/setup/prepare_swe_utils.sh        |   1 +
 .../scripts/setup/prepare_swe_utils.sh        |   1 +
 legacy/utils/version_control.sh               |   4 +
 pyproject.toml                                |   1 +
 tests/test_validate_git_reset.py              | 150 +++++++++++++++
 11 files changed, 364 insertions(+)
 create mode 100644 benchmarks/scripts/validate_git_reset.py
 create mode 100644 tests/test_validate_git_reset.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c130cfd2..c5c2c4c5 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -37,3 +37,11 @@ repos:
         pass_filenames: true
         always_run: false
         exclude: ^legacy/
+      - id: validate-git-reset
+        name: Validate git reset after clone/checkout
+        entry: uv
+        args: [run, python, benchmarks/scripts/validate_git_reset.py]
+        language: system
+        types_or: [python, shell]
+        pass_filenames: true
+        always_run: false
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index df10feb2..777d87ab 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -245,6 +245,12 @@ def prepare_workspace(
             raise RuntimeError(f"Failed to clone repo: {res.stderr}")
         logger.info(f"Cloned repository: {instance.data['repo']}")
 
+        # Reset to ensure clean state at HEAD
+        reset_cmd = f"cd /workspace/{workspace_dir_name} && git reset --hard HEAD"
+        res = workspace.execute_command(reset_cmd, timeout=60)
+        if res.exit_code != 0:
+            raise RuntimeError(f"Failed to reset repo: {res.stderr}")
+
         # Create new branch
         branch_cmd = f"cd /workspace/{workspace_dir_name} && git checkout -b openhands"
         res = workspace.execute_command(branch_cmd, timeout=600)
diff --git a/benchmarks/scripts/validate_git_reset.py b/benchmarks/scripts/validate_git_reset.py
new file mode 100644
index 00000000..9bf1855d
--- /dev/null
+++ b/benchmarks/scripts/validate_git_reset.py
@@ -0,0 +1,178 @@
+"""Validation script to ensure git reset follows git clone/checkout operations.
+
+During benchmark evaluation, we want to test a git repository at a specific commit.
+To prevent the agent from looking at commits that are not part of the benchmark,
+this script validates that every `git clone` or `git checkout` is followed by a
+`git reset` command (or has a comment indicating git reset is not needed).
+
+Usage:
+    validate-git-reset [path...]
+
+If no paths are provided, defaults to scanning the repository root.
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+
+
+# Patterns for finding git clone/checkout commands
+GIT_CLONE_PATTERN = re.compile(r"git\s+clone\b")
+GIT_CHECKOUT_PATTERN = re.compile(r"git\s+checkout\b")
+
+# Pattern for git reset (can be in code or in a comment)
+GIT_RESET_PATTERN = re.compile(r"git\s+reset\b")
+
+# File extensions to check
+CHECK_EXTENSIONS = {".py", ".sh"}
+
+# Files to skip (relative to repository root)
+SKIP_PATTERNS = [
+    "validate_git_reset.py",  # This script itself
+    "test_validate_git_reset.py",  # Test file for this script
+]
+
+
+def should_skip_file(file_path: Path) -> bool:
+    """Check if a file should be skipped from validation."""
+    for pattern in SKIP_PATTERNS:
+        if file_path.name == pattern or pattern in str(file_path):
+            return True
+    return False
+
+
+def find_git_operations(
+    content: str,
+) -> list[tuple[int, str, str]]:
+    """Find all git clone and git checkout operations in the content.
+
+    Returns a list of tuples: (line_number, line_content, operation_type)
+    """
+    operations = []
+    lines = content.split("\n")
+
+    for i, line in enumerate(lines):
+        line_num = i + 1  # 1-indexed line numbers
+        if GIT_CLONE_PATTERN.search(line):
+            operations.append((line_num, line, "git clone"))
+        # Only flag git checkout if it's not "git checkout -b" (creating a branch)
+        # which doesn't need a reset since it's creating a new branch
+        elif GIT_CHECKOUT_PATTERN.search(line):
+            # Skip "git checkout -b" (create branch) as it doesn't need reset
+            if not re.search(r"git\s+checkout\s+-b\b", line):
+                operations.append((line_num, line, "git checkout"))
+
+    return operations
+
+
+def has_git_reset_nearby(
+    content: str,
+    operation_line: int,
+    context_lines_after: int = 20,
+    context_lines_before: int = 5,
+) -> bool:
+    """Check if there's a git reset within context lines around the operation.
+
+    Also accepts git reset in comments as valid (to allow explicit documentation
+    that reset is intentionally skipped).
+    """
+    lines = content.split("\n")
+    start_line = max(0, operation_line - 1 - context_lines_before)
+    end_line = min(operation_line + context_lines_after, len(lines))
+
+    # Check lines before and after the operation
+    for i in range(start_line, end_line):
+        if GIT_RESET_PATTERN.search(lines[i]):
+            return True
+
+    return False
+
+
+def validate_file(file_path: Path) -> list[tuple[int, str, str]]:
+    """Validate a single file for git reset after clone/checkout.
+
+    Returns a list of violations: (line_number, line_content, operation_type)
+    """
+    violations = []
+
+    try:
+        content = file_path.read_text()
+    except (OSError, UnicodeDecodeError):
+        return violations
+
+    operations = find_git_operations(content)
+
+    for line_num, line_content, op_type in operations:
+        if not has_git_reset_nearby(content, line_num):
+            violations.append((line_num, line_content.strip(), op_type))
+
+    return violations
+
+
+def find_files_to_check(paths: list[Path]) -> list[Path]:
+    """Find all files that should be checked for git operations."""
+    files = []
+
+    for path in paths:
+        if path.is_file():
+            if path.suffix in CHECK_EXTENSIONS and not should_skip_file(path):
+                files.append(path)
+        elif path.is_dir():
+            for ext in CHECK_EXTENSIONS:
+                for file in path.rglob(f"*{ext}"):
+                    if not should_skip_file(file):
+                        files.append(file)
+
+    return files
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(
+        description="Validate that git reset follows git clone/checkout operations"
+    )
+    parser.add_argument(
+        "paths",
+        nargs="*",
+        type=Path,
+        default=[Path(".")],
+        help="Paths to check (files or directories). Defaults to current directory.",
+    )
+    args = parser.parse_args()
+
+    files = find_files_to_check(args.paths)
+
+    all_violations: list[tuple[Path, int, str, str]] = []
+
+    for file in files:
+        violations = validate_file(file)
+        for line_num, line_content, op_type in violations:
+            all_violations.append((file, line_num, line_content, op_type))
+
+    if all_violations:
+        print("ERROR: Found git clone/checkout without git reset:", file=sys.stderr)
+        print(file=sys.stderr)
+        for file, line_num, line_content, op_type in all_violations:
+            print(f"  {file}:{line_num}: {op_type}", file=sys.stderr)
+            print(f"    {line_content}", file=sys.stderr)
+            print(file=sys.stderr)
+        print(
+            "To fix: Add 'git reset --hard <commit>' after the git operation,",
+            file=sys.stderr,
+        )
+        print(
+            "or add a comment containing 'git reset' to indicate it's intentional.",
+            file=sys.stderr,
+        )
+        print(
+            "Example: # git reset is not needed here because...",
+            file=sys.stderr,
+        )
+        return 1
+
+    print(f"OK: Checked {len(files)} files, no violations found.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/legacy/lca_ci_build_repair/eval_infer.py b/legacy/lca_ci_build_repair/eval_infer.py
index 93636515..07cd3958 100644
--- a/legacy/lca_ci_build_repair/eval_infer.py
+++ b/legacy/lca_ci_build_repair/eval_infer.py
@@ -105,6 +105,11 @@ def run_eval(
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 
+    action = CmdRunAction(command='git reset --hard HEAD')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
     script_dir = os.path.dirname(
         os.path.abspath(__file__)
     )  # Get the absolute path of the script
diff --git a/legacy/lca_ci_build_repair/run_infer.py b/legacy/lca_ci_build_repair/run_infer.py
index e6938301..d96f185f 100644
--- a/legacy/lca_ci_build_repair/run_infer.py
+++ b/legacy/lca_ci_build_repair/run_infer.py
@@ -132,6 +132,11 @@ def initialize_runtime(
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 
+    action = CmdRunAction(command='git reset --hard HEAD')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
     script_dir = os.path.dirname(
         os.path.abspath(__file__)
     )  # Get the absolute path of the script
diff --git a/legacy/ml_bench/run_infer.py b/legacy/ml_bench/run_infer.py
index 48e3d678..7c0b5f12 100644
--- a/legacy/ml_bench/run_infer.py
+++ b/legacy/ml_bench/run_infer.py
@@ -122,6 +122,11 @@ def initialize_runtime(
     obs = runtime.run_action(action)
     assert obs.exit_code == 0
 
+    action = CmdRunAction(command=f'cd /workspace/{repo_name} && git reset --hard HEAD')
+    logger.info(action, extra={'msg_type': 'ACTION'})
+    obs = runtime.run_action(action)
+    assert obs.exit_code == 0
+
     action = CmdRunAction(command=f'chmod -R 777 /workspace/{repo_name}')
     logger.info(action, extra={'msg_type': 'ACTION'})
     obs = runtime.run_action(action)
diff --git a/legacy/swefficiency/scripts/setup/prepare_swe_utils.sh b/legacy/swefficiency/scripts/setup/prepare_swe_utils.sh
index c5726a40..f99bcb16 100755
--- a/legacy/swefficiency/scripts/setup/prepare_swe_utils.sh
+++ b/legacy/swefficiency/scripts/setup/prepare_swe_utils.sh
@@ -9,6 +9,7 @@ echo "==== Prepare SWE-bench repo ===="
 OH_SWE_BENCH_REPO_PATH="https://github.com/All-Hands-AI/SWE-bench.git"
 OH_SWE_BENCH_REPO_BRANCH="eval"
 git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
+cd $EVAL_WORKSPACE/OH-SWE-bench && git reset --hard HEAD
 
 # 2. Prepare DATA
 echo "==== Prepare SWE-bench data ===="
diff --git a/legacy/testgeneval/scripts/setup/prepare_swe_utils.sh b/legacy/testgeneval/scripts/setup/prepare_swe_utils.sh
index 528224fe..ed14f776 100755
--- a/legacy/testgeneval/scripts/setup/prepare_swe_utils.sh
+++ b/legacy/testgeneval/scripts/setup/prepare_swe_utils.sh
@@ -9,6 +9,7 @@ echo "==== Prepare SWE-bench repo ===="
 OH_SWE_BENCH_REPO_PATH="https://github.com/OpenHands/SWE-bench.git"
 OH_SWE_BENCH_REPO_BRANCH="eval"
 git clone -b $OH_SWE_BENCH_REPO_BRANCH $OH_SWE_BENCH_REPO_PATH $EVAL_WORKSPACE/OH-SWE-bench
+cd $EVAL_WORKSPACE/OH-SWE-bench && git reset --hard HEAD
 
 # 2. Prepare DATA
 echo "==== Prepare SWE-bench data ===="
diff --git a/legacy/utils/version_control.sh b/legacy/utils/version_control.sh
index 5fc58e43..83f9e64c 100644
--- a/legacy/utils/version_control.sh
+++ b/legacy/utils/version_control.sh
@@ -17,12 +17,15 @@ checkout_eval_branch() {
     current_branch=$(git rev-parse --abbrev-ref HEAD)
     echo "Current version is: $current_branch"
     echo "Check out OpenHands to version: $COMMIT_HASH"
+    # git reset is not needed here - this checkout targets a specific commit hash
+    # and is used for version control within the development environment
     if ! git checkout $COMMIT_HASH; then
         echo "Failed to check out to $COMMIT_HASH"
         exit 1
     fi
 
     echo "Revert changes in evaluation folder"
+    # git reset is not needed - restoring evaluation folder from current branch
     git checkout $current_branch -- evaluation
 
     # Trap the EXIT signal to checkout original branch
@@ -36,6 +39,7 @@ checkout_original_branch() {
         return 0
     fi
     echo "Checkout back to original branch $current_branch"
+    # git reset is not needed - restoring to original branch after evaluation
     git checkout $current_branch
 }
 
diff --git a/pyproject.toml b/pyproject.toml
index fc5ed7e0..777ddc27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
 
 [project.scripts]
 validate-cfg = "benchmarks.scripts.validate_cfg:main"
+validate-git-reset = "benchmarks.scripts.validate_git_reset:main"
 swebench-infer = "benchmarks.swebench.run_infer:main"
 swtbench-infer = "benchmarks.swtbench.run_infer:main"
 swebench-eval = "benchmarks.swebench.eval_infer:main"
diff --git a/tests/test_validate_git_reset.py b/tests/test_validate_git_reset.py
new file mode 100644
index 00000000..6dfc8fee
--- /dev/null
+++ b/tests/test_validate_git_reset.py
@@ -0,0 +1,150 @@
+"""Tests for the validate_git_reset script."""
+
+import tempfile
+from pathlib import Path
+
+from benchmarks.scripts.validate_git_reset import (
+    find_git_operations,
+    has_git_reset_nearby,
+    validate_file,
+)
+
+
+class TestFindGitOperations:
+    def test_finds_git_clone(self):
+        content = """
+some code
+git clone https://example.com/repo.git
+more code
+"""
+        operations = find_git_operations(content)
+        assert len(operations) == 1
+        assert operations[0][0] == 3  # line number
+        assert "git clone" in operations[0][1]  # line content
+        assert operations[0][2] == "git clone"  # operation type
+
+    def test_finds_git_checkout(self):
+        content = """
+some code
+git checkout main
+more code
+"""
+        operations = find_git_operations(content)
+        assert len(operations) == 1
+        assert operations[0][0] == 3
+        assert "git checkout" in operations[0][1]
+        assert operations[0][2] == "git checkout"
+
+    def test_ignores_git_checkout_b(self):
+        """git checkout -b creates a new branch, so doesn't need reset."""
+        content = """
+some code
+git checkout -b new-branch
+more code
+"""
+        operations = find_git_operations(content)
+        assert len(operations) == 0
+
+    def test_finds_multiple_operations(self):
+        content = """
+git clone https://repo1.git
+some code
+git clone https://repo2.git
+git checkout feature
+"""
+        operations = find_git_operations(content)
+        assert len(operations) == 3
+
+    def test_finds_operations_in_strings(self):
+        content = """
+action = CmdRunAction(command=f'git clone {repo_url}')
+"""
+        operations = find_git_operations(content)
+        assert len(operations) == 1
+
+
+class TestHasGitResetNearby:
+    def test_finds_reset_after_operation(self):
+        content = """git clone https://example.com/repo.git
+git reset --hard HEAD
+"""
+        assert has_git_reset_nearby(content, 1) is True
+
+    def test_finds_reset_in_comment_after(self):
+        content = """git clone https://example.com/repo.git
+# git reset is not needed here
+"""
+        assert has_git_reset_nearby(content, 1) is True
+
+    def test_finds_reset_in_comment_before(self):
+        content = """# git reset is not needed here because this is a utility
+git checkout main
+"""
+        assert has_git_reset_nearby(content, 2) is True
+
+    def test_no_reset_nearby(self):
+        content = """git clone https://example.com/repo.git
+some other code
+more code
+"""
+        assert has_git_reset_nearby(content, 1, context_lines_after=2) is False
+
+    def test_reset_beyond_context_window(self):
+        content = """git clone https://example.com/repo.git
+line 1
+line 2
+line 3
+line 4
+line 5
+git reset --hard HEAD
+"""
+        # Default context is 20 lines, so this should be found
+        assert has_git_reset_nearby(content, 1) is True
+        # With small context window, should not be found
+        assert has_git_reset_nearby(content, 1, context_lines_after=3) is False
+
+
+class TestValidateFile:
+    def test_valid_file_with_reset(self):
+        with tempfile.NamedTemporaryFile(suffix=".py", mode="w", delete=False) as f:
+            f.write("""
+clone_cmd = 'git clone https://example.com/repo.git'
+os.system(clone_cmd)
+os.system('git reset --hard HEAD')
+""")
+            f.flush()
+            violations = validate_file(Path(f.name))
+            assert len(violations) == 0
+
+    def test_valid_file_with_comment(self):
+        with tempfile.NamedTemporaryFile(suffix=".py", mode="w", delete=False) as f:
+            f.write("""
+# git reset is not needed here - this is a dev environment utility
+cmd = 'git checkout main'
+os.system(cmd)
+""")
+            f.flush()
+            violations = validate_file(Path(f.name))
+            assert len(violations) == 0
+
+    def test_invalid_file_missing_reset(self):
+        with tempfile.NamedTemporaryFile(suffix=".py", mode="w", delete=False) as f:
+            f.write("""
+clone_cmd = 'git clone https://example.com/repo.git'
+os.system(clone_cmd)
+# no reset here
+""")
+            f.flush()
+            violations = validate_file(Path(f.name))
+            assert len(violations) == 1
+            assert violations[0][2] == "git clone"
+
+    def test_shell_script_with_reset(self):
+        with tempfile.NamedTemporaryFile(suffix=".sh", mode="w", delete=False) as f:
+            f.write("""#!/bin/bash
+git clone https://example.com/repo.git /workspace/repo
+cd /workspace/repo && git reset --hard HEAD
+""")
+            f.flush()
+            violations = validate_file(Path(f.name))
+            assert len(violations) == 0