From 41e80a510a190a02bef470783638d9242dc99e1a Mon Sep 17 00:00:00 2001 From: muhtasham Date: Sun, 26 Apr 2026 02:54:44 +0200 Subject: [PATCH] Add OpenAI sweep tooling and analysis helpers - add benchmark runners, watcher, and eval pipeline scripts for OpenAI sweeps - add generated configs, reporting utilities, and OpenAI feedback notes - preserve model aliases in analysis and fix RoboCode tie handling --- codeclash/analysis/metrics/elo.py | 27 +- codeclash/analysis/metrics/win_rate.py | 14 +- codeclash/analysis/viz/heatmap_win_rates.py | 5 +- codeclash/analysis/viz/utils.py | 16 + .../arenas/huskybench/HuskyBench.Dockerfile | 3 +- codeclash/arenas/robocode/robocode.py | 9 +- configs/ablations/scaffold/README.md | 184 +++++++ configs/ablations/scaffold/manifest.yaml | 142 +++++ ...ake__gpt-5.3-codex__gpt-5__r15__s1000.yaml | 40 ++ ...War__gpt-5.3-codex__gpt-5__r15__s1000.yaml | 37 ++ ...lite__gpt-5.3-codex__gpt-5__r15__s250.yaml | 48 ++ ...ench__gpt-5.3-codex__gpt-5__r15__s100.yaml | 39 ++ ...Code__gpt-5.3-codex__gpt-5__r15__s250.yaml | 42 ++ ...mble__gpt-5.3-codex__gpt-5__r15__s250.yaml | 39 ++ ...lt__gpt-5.3-codex-default__r15__s1000.yaml | 36 ++ ...lt__gpt-5.3-codex-default__r15__s1000.yaml | 33 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 44 ++ ...ult__gpt-5.3-codex-default__r15__s100.yaml | 35 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 38 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 35 ++ ...-high__gpt-5.3-codex-high__r15__s1000.yaml | 40 ++ ...-high__gpt-5.3-codex-high__r15__s1000.yaml | 37 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 48 ++ ...4-high__gpt-5.3-codex-high__r15__s100.yaml | 39 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 42 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 39 ++ ....4-low__gpt-5.3-codex-low__r15__s1000.yaml | 40 ++ ....4-low__gpt-5.3-codex-low__r15__s1000.yaml | 37 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 48 ++ ...5.4-low__gpt-5.3-codex-low__r15__s100.yaml | 39 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 42 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 39 ++ ...ium__gpt-5.3-codex-medium__r15__s1000.yaml | 40 ++ ...ium__gpt-5.3-codex-medium__r15__s1000.yaml | 37 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 48 ++ ...dium__gpt-5.3-codex-medium__r15__s100.yaml | 39 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 42 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 39 ++ ...ttleSnake__gpt-5.4__gpt-5__r15__s1000.yaml | 40 ++ .../CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml | 37 ++ .../Halite__gpt-5.4__gpt-5__r15__s250.yaml | 48 ++ ...HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml | 39 ++ .../RoboCode__gpt-5.4__gpt-5__r15__s250.yaml | 42 ++ ...obotRumble__gpt-5.4__gpt-5__r15__s250.yaml | 39 ++ docs/openai_feedback_20260310.md | 94 ++++ scripts/finalize_openai_sweep_report.sh | 82 +++ scripts/plot_leaderboard_comparison.py | 117 ++++ scripts/plot_reasoning_head_to_head.py | 226 ++++++++ scripts/print_leaderboard_table.py | 55 ++ scripts/run_eval_pipeline.sh | 119 +++++ scripts/run_gpt54_gpt53codex_round_robin.sh | 312 +++++++++++ .../run_gpt54_vs_gpt53codex_high_remaining.sh | 81 +++ scripts/run_gpt54_vs_gpt53codex_reasoning.sh | 222 ++++++++ scripts/run_openai_model_benchmarks.sh | 498 ++++++++++++++++++ scripts/run_openai_sweep.sh | 332 ++++++++++++ scripts/scrape_viewer_leaderboard_runs.py | 324 ++++++++++++ scripts/watch_sweep_progress.sh | 199 +++++++ tests/arenas/test_robocode.py | 32 ++ 58 files changed, 4512 insertions(+), 27 deletions(-) create mode 100644 configs/ablations/scaffold/README.md 
create mode 100644 configs/ablations/scaffold/manifest.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml 
create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 docs/openai_feedback_20260310.md create mode 100755 scripts/finalize_openai_sweep_report.sh create mode 100644 scripts/plot_leaderboard_comparison.py create mode 100644 scripts/plot_reasoning_head_to_head.py create mode 100644 scripts/print_leaderboard_table.py create mode 100755 scripts/run_eval_pipeline.sh create mode 100755 scripts/run_gpt54_gpt53codex_round_robin.sh create mode 100755 scripts/run_gpt54_vs_gpt53codex_high_remaining.sh create mode 100755 scripts/run_gpt54_vs_gpt53codex_reasoning.sh create mode 100755 scripts/run_openai_model_benchmarks.sh create mode 100755 scripts/run_openai_sweep.sh create mode 100644 scripts/scrape_viewer_leaderboard_runs.py create mode 100755 scripts/watch_sweep_progress.sh diff --git a/codeclash/analysis/metrics/elo.py b/codeclash/analysis/metrics/elo.py index 929d3154..c9207234 100644 --- a/codeclash/analysis/metrics/elo.py +++ b/codeclash/analysis/metrics/elo.py @@ -14,7 +14,7 @@ from tqdm import tqdm from codeclash.analysis.significance import calculate_p_value -from codeclash.analysis.viz.utils import ASSETS_DIR, FONT_BOLD, MODEL_TO_DISPLAY_NAME +from codeclash.analysis.viz.utils import ASSETS_DIR, FONT_BOLD, MODEL_TO_DISPLAY_NAME, model_display_name from codeclash.constants import LOCAL_LOG_DIR, RESULT_TIE from codeclash.utils.log import add_file_handler, get_logger @@ -75,9 +75,6 @@ def __init__( lambda: defaultdict(list) ) - def _get_unique_model_name(self, model: str) -> str: - return model.rpartition("/")[2] - def _get_sorted_pair(self, p1: str, p2: str) -> tuple[str, str]: return tuple(sorted([p1, p2])) @@ -154,8 +151,6 @@ def _process_tournament(self, metadata_path: Path) -> None: return player_names = [p["name"] for p in players] - models = [p["config"]["model"]["model_name"].strip("@") for p in players] - # Aggregate scores for each round p1_round_scores = [] p2_round_scores = [] @@ -199,7 +194,7 @@ def _process_tournament(self, metadata_path: Path) -> None: p2_score = sum(p2_round_scores) # Convert to unique names and sorted pair when updating matrix - unique_names = [self._get_unique_model_name(m) for m in models] + unique_names = player_names sorted_pair = self._get_sorted_pair(unique_names[0], unique_names[1]) if unique_names[0] == sorted_pair[0]: @@ -550,7 +545,7 @@ def create_elo_plots(self, output_dir: Path) -> None: player_order = [all_players[i] for i in all_indices] # Translate to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in player_order] + display_names = [model_display_name(p) for p in 
player_order] # Create mapping from player to y-position player_to_pos = {p: i for i, p in enumerate(player_order)} @@ -698,7 +693,7 @@ def create_validation_plots(self, output_dir: Path, regularization: float = 0.01 ax.set_xlabel("BT Strength", fontproperties=FONT_BOLD, fontsize=12) ax.set_ylabel("Negative Log-Likelihood", fontproperties=FONT_BOLD, fontsize=12) - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.set_title(display_name, fontproperties=FONT_BOLD, fontsize=14) legend = ax.legend(prop=FONT_BOLD, fontsize=10, loc="upper right") legend.set_frame_on(False) @@ -777,7 +772,7 @@ def _create_rank_matrix_plot( rank_matrix = (rank_matrix / self.n_bootstrap) * 100 # Translate player names to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in players] + display_names = [model_display_name(p) for p in players] fig, ax = plt.subplots(figsize=(6, 6)) im = ax.imshow(rank_matrix, cmap="YlOrRd", aspect="auto", vmin=0, vmax=100) @@ -826,7 +821,7 @@ def _create_elo_violin_plot( elo_data = [elo_samples[p] for p in players] # Translate player names to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in players] + display_names = [model_display_name(p) for p in players] fig, ax = plt.subplots(figsize=(6, 6)) @@ -1095,7 +1090,7 @@ def _plot_results(self, results_by_max_round: dict[int, dict[str, dict[str, floa elos_list.append(results_by_max_round[max_round][game_name][player]) if max_rounds_list: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.plot(max_rounds_list, elos_list, marker="o", label=display_name, linewidth=2, markersize=6) ax.set_xlabel("Max Round", fontproperties=FONT_BOLD, fontsize=14) @@ -1212,7 +1207,7 @@ def _plot_results(self, results_by_round: dict[int, dict[str, dict[str, float]]] elos_list.append(results_by_round[round_num][game_name][player]) if rounds_list: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.plot(rounds_list, elos_list, marker="o", label=display_name, linewidth=2, markersize=6) ax.set_xlabel("Round", fontproperties=FONT_BOLD, fontsize=14) @@ -1348,7 +1343,7 @@ def write_latex_table(results: dict[str, dict], output_dir: Path) -> None: lines.append(r"\midrule") for player, all_elo in sorted_players: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) row_parts = [display_name.replace("_", r"\_")] for game_name in games_in_table: @@ -1407,7 +1402,7 @@ def write_website_results(results: dict[str, dict], output_dir: Path) -> None: # Create leaderboard entries board = [] for rank, (player, elo) in enumerate(sorted_players): - entry = {"rank": rank + 1, "model": MODEL_TO_DISPLAY_NAME.get(player, player), "elo": int(round(elo))} + entry = {"rank": rank + 1, "model": model_display_name(player), "elo": int(round(elo))} # Add confidence interval if available if elo_std is not None: player_idx = players.index(player) @@ -1506,7 +1501,7 @@ def write_latex_table_plain(results: dict[str, dict], output_dir: Path) -> None: lines.append(r"\midrule") for player, all_elo in sorted_players: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) row_parts = [display_name.replace("_", r"\_")] for game_name in games_in_table: diff --git a/codeclash/analysis/metrics/win_rate.py b/codeclash/analysis/metrics/win_rate.py index aaa3df0c..58fdf664 100755 --- 
a/codeclash/analysis/metrics/win_rate.py +++ b/codeclash/analysis/metrics/win_rate.py @@ -31,16 +31,16 @@ def main(log_dir: Path): model_profiles = {} for game_log_folder in tqdm([x.parent for x in log_dir.rglob("metadata.json")]): game_id = game_log_folder.name.split(".")[1] - player_ids = [x.name for x in (game_log_folder / "players").iterdir() if x.is_dir()] metadata = json.load(open(game_log_folder / "metadata.json")) try: - player_to_model = { - x["name"]: x["config"]["model"]["model_name"].strip("@").split("/")[-1] - for x in metadata["config"]["players"] - } + player_ids = [x["name"] for x in metadata["config"]["players"]] + player_to_model = {x["name"]: x["name"] for x in metadata["config"]["players"]} except KeyError: continue - num_rounds = len(metadata["round_stats"]) + round_stats = metadata.get("round_stats") + if not isinstance(round_stats, dict) or not round_stats: + continue + num_rounds = len(round_stats) # Only count each unique model once per game unique_models = {player_to_model[player] for player in player_ids} @@ -55,7 +55,7 @@ def main(log_dir: Path): player_id=player_id, model_name=model_name, game_id=game_id, count=num_rounds ) - for round, details in metadata["round_stats"].items(): + for round, details in round_stats.items(): if round == "0": # Skip initial round continue diff --git a/codeclash/analysis/viz/heatmap_win_rates.py b/codeclash/analysis/viz/heatmap_win_rates.py index f817f503..b2139f1b 100755 --- a/codeclash/analysis/viz/heatmap_win_rates.py +++ b/codeclash/analysis/viz/heatmap_win_rates.py @@ -59,7 +59,7 @@ def main(log_dir: Path, unit: str = "rounds", output_file: Path = ASSETS_DIR / " # Build matrix models = sorted({m for pair in results.keys() for m in pair}) - clean_names = [MODEL_TO_DISPLAY_NAME[m.split("/")[-1]] for m in models] + clean_names = [MODEL_TO_DISPLAY_NAME.get(m.split("/")[-1], m.split("/")[-1]) for m in models] n = len(models) matrix = np.full((n, n), np.nan) @@ -73,7 +73,8 @@ def main(log_dir: Path, unit: str = "rounds", output_file: Path = ASSETS_DIR / " total_wins = sum(results[(m1, m2)][0] for m2 in models if m1 != m2) total_matches = sum(results[(m1, m2)][1] for m2 in models if m1 != m2) avg_win_rate = total_wins / total_matches if total_matches > 0 else 0 - print(f"{MODEL_TO_DISPLAY_NAME[m1.split('/')[-1]]}: {avg_win_rate:.2%} win rate over {total_matches} matches") + label = MODEL_TO_DISPLAY_NAME.get(m1.split("/")[-1], m1.split("/")[-1]) + print(f"{label}: {avg_win_rate:.2%} win rate over {total_matches} matches") # Plot FONT_BOLD.set_size(18) diff --git a/codeclash/analysis/viz/utils.py b/codeclash/analysis/viz/utils.py index 168e8c64..5e934cad 100644 --- a/codeclash/analysis/viz/utils.py +++ b/codeclash/analysis/viz/utils.py @@ -21,6 +21,22 @@ "o3": "o3", } + +def model_display_name(model: str) -> str: + label = MODEL_TO_DISPLAY_NAME.get(model, model) + tier_labels = { + "-default": " (Default)", + "-low": " (Low)", + "-medium": " (Medium)", + "-high": " (High)", + } + for suffix, pretty in tier_labels.items(): + if model.endswith(suffix): + base = model[: -len(suffix)] + base_label = MODEL_TO_DISPLAY_NAME.get(base, base) + return f"{base_label}{pretty}" + return label + MODEL_TO_COLOR = { "anthropic/claude-sonnet-4-20250514": "#FFD449", "anthropic/claude-sonnet-4-5-20250929": "#F75C03", diff --git a/codeclash/arenas/huskybench/HuskyBench.Dockerfile b/codeclash/arenas/huskybench/HuskyBench.Dockerfile index 0b1d4d52..9e1070f2 100644 --- a/codeclash/arenas/huskybench/HuskyBench.Dockerfile +++ 
b/codeclash/arenas/huskybench/HuskyBench.Dockerfile @@ -16,5 +16,6 @@ RUN git clone https://github.com/CodeClash-ai/HuskyBench.git /workspace \ && git remote set-url origin https://github.com/CodeClash-ai/HuskyBench.git WORKDIR /workspace -RUN pip install -r engine/requirements.txt +RUN pip install --no-cache-dir Cython setuptools wheel \ + && pip install --no-cache-dir -r engine/requirements.txt RUN mkdir -p /workspace/engine/output diff --git a/codeclash/arenas/robocode/robocode.py b/codeclash/arenas/robocode/robocode.py index a7dc16fb..da473873 100644 --- a/codeclash/arenas/robocode/robocode.py +++ b/codeclash/arenas/robocode/robocode.py @@ -10,6 +10,7 @@ from codeclash.agents.player import Player from codeclash.arenas.arena import CodeArena, RoundStats +from codeclash.constants import RESULT_TIE from codeclash.utils.environment import create_file_in_container RC_FILE = Path("MyTank.java") @@ -140,7 +141,13 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): player = match.group(2).rsplit(".", 1)[0] scores[player] += int(match.group(3)) - stats.winner = max(scores, key=scores.get) + if not scores: + stats.winner = RESULT_TIE + return + + max_score = max(scores.values()) + leaders = [player for player, score in scores.items() if score == max_score] + stats.winner = RESULT_TIE if len(leaders) > 1 else leaders[0] stats.scores = scores for player, score in scores.items(): stats.player_stats[player].score = score diff --git a/configs/ablations/scaffold/README.md b/configs/ablations/scaffold/README.md new file mode 100644 index 00000000..e2b9b6c5 --- /dev/null +++ b/configs/ablations/scaffold/README.md @@ -0,0 +1,184 @@ +# Scaffold / Harness Ablation + +This folder defines a concrete experiment matrix for answering: + +1. How much of CodeClash performance is due to the model versus the agent harness? +2. Do Codex-style agent stacks help because of the scaffold alone, or because the model and scaffold are co-designed? + +## Current repository limitation + +Today this repository only exposes two agent types: + +- `mini` +- `dummy` + +See `codeclash/agents/__init__.py`. + +That means the experiments below are split into: + +- `ready_now`: Can be run once the referenced model exists in `configs/models.yaml` +- `blocked_on_adapter`: Requires adding a new agent adapter for `swe-agent`, `openhands`, or `codex-sdk` + +## Design rules + +All harness comparisons should keep the following fixed unless the experiment explicitly says otherwise: + +- same arena +- same opponent panel +- same model +- same number of rounds +- same per-round step limit +- same per-round dollar limit +- same tool surface +- same repository snapshot +- same visibility into logs and docs +- same replication count + +Do not give one harness extra tools, hidden memory, or a longer prompt unless that is the variable under test. + +## Phases + +### Phase A: Cheap scaffold-only screen + +Purpose: +Measure scaffold effects while holding the model fixed. 
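+As a sanity check on the run-count arithmetic used below, the Phase A grid can be enumerated directly. This is an illustrative sketch only (not tooling shipped with this patch); the names come from the panels that follow.
+
+```python
+from itertools import product
+
+harnesses = ["mini", "swe-agent", "openhands", "codex-sdk"]
+opponents = [
+    "@anthropic/claude-sonnet-4-5-20250929",
+    "@openai/o3",
+    "@x-ai/grok-code-fast-1",
+]
+arenas = ["BattleSnake", "CoreWar", "RobotRumble"]
+reps = range(2)
+
+cells = list(product(harnesses, opponents, arenas, reps))
+assert len(cells) == 72  # 4 harnesses x 3 opponents x 3 arenas x 2 reps
+```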
+ +System under test: + +- model: `@openai/gpt-5-mini` +- harnesses: + - `mini` (existing baseline) + - `swe-agent` (planned) + - `openhands` (planned) + - `codex-sdk` (planned thin adapter, not the full Codex product stack) + +Opponents: + +- `@anthropic/claude-sonnet-4-5-20250929` +- `@openai/o3` +- `@x-ai/grok-code-fast-1` + +Arenas: + +- `BattleSnake` (`r5`, `s1000`) +- `CoreWar` (`r5`, `s1000`) +- `RobotRumble` (`r5`, `s250`) + +Replications: + +- `2` independent tournaments per cell + +Run count: + +- `4 harnesses x 3 opponents x 3 arenas x 2 reps = 72 tournaments` + +Advance rule: + +- Promote the best two harnesses by pooled Elo / win rate +- Require no obvious regression in validation rate, bash success, or recovery after failure + +### Phase B: Cheap Codex stack test + +Purpose: +Separate generic scaffold effects from model-stack co-design. + +Systems: + +- `best_generic_harness + @openai/gpt-5-mini` +- `best_generic_harness + @openai/gpt-5.1-codex-mini` +- `codex-sdk + @openai/gpt-5.1-codex-mini` + +Opponents: + +- same as Phase A + +Arenas: + +- same as Phase A + +Replications: + +- `2` independent tournaments per cell + +Run count: + +- `3 systems x 3 opponents x 3 arenas x 2 reps = 54 tournaments` + +Interpretation: + +- If `codex-sdk + gpt-5.1-codex-mini` beats `best_generic_harness + gpt-5.1-codex-mini`, the scaffold matters. +- If `best_generic_harness + gpt-5.1-codex-mini` already captures most of the gain, the model matters more than the scaffold. + +### Phase C: Expensive confirmation + +Purpose: +Confirm the screen on a stronger model after the cheap runs identify promising cells. + +Systems: + +- top `2` systems from Phase B + +Model: + +- `@openai/gpt-5.4` + +Opponents: + +- `@anthropic/claude-sonnet-4-5-20250929` +- `@openai/o3` + +Arenas: + +- all six benchmark arenas + +Tournament budget: + +- `r15` +- standard paper simulation counts per arena + +Replications: + +- `1` independent tournament per cell + +Run count: + +- `2 systems x 2 opponents x 6 arenas x 1 rep = 24 tournaments` + +## Primary metrics + +- pooled Elo +- per-arena Elo +- head-to-head win rate excluding ties +- top-1 consistency under bootstrap +- pairwise order agreement under bootstrap + +## Diagnostic metrics + +- bash/action success rate +- next-step recovery after failed command +- fraction of rounds with grounded edits +- fraction of rounds with simulation-based validation +- fraction of rounds with unit-test validation +- mean files edited per round +- mean thought length / steps per round + +## Minimum logging requirements + +For every tournament, retain: + +- `metadata.json` +- trajectories +- per-round diffs +- round stats +- cost and API call counts + +For scaffold adapters, also log: + +- prompt template used +- tool whitelist / sandbox mode +- whether notes persist across rounds +- any harness-specific retries or auto-fixes + +## Why this matrix + +This matrix deliberately starts with cheap screening. `GPT-5.4` should only be used after the cheap phase narrows the search space. The point is not to prove that one harness wins one benchmark snapshot; the point is to isolate whether improvements survive when model, budget, and arena are held fixed. diff --git a/configs/ablations/scaffold/manifest.yaml b/configs/ablations/scaffold/manifest.yaml new file mode 100644 index 00000000..d9db41a6 --- /dev/null +++ b/configs/ablations/scaffold/manifest.yaml @@ -0,0 +1,142 @@ +version: 1 + +notes: + - "This is an experiment manifest, not an execution format consumed by CodeClash today." 
+ - "Current repo support is limited to agent=mini and agent=dummy." + - "Cells with blocked_on_adapter require a new agent implementation." + +fairness_constraints: + same_arena: true + same_rounds_per_cell: true + same_step_limit_per_round: true + same_cost_limit_per_round: true + same_repo_snapshot: true + same_tool_surface: true + same_log_visibility: true + same_replication_count: true + +opponent_panel: + - model: "@anthropic/claude-sonnet-4-5-20250929" + - model: "@openai/o3" + - model: "@x-ai/grok-code-fast-1" + +arena_panel_screen: + - arena: "BattleSnake" + rounds: 5 + sims_per_round: 1000 + - arena: "CoreWar" + rounds: 5 + sims_per_round: 1000 + - arena: "RobotRumble" + rounds: 5 + sims_per_round: 250 + +arena_panel_confirm: + - arena: "BattleSnake" + rounds: 15 + sims_per_round: 1000 + - arena: "CoreWar" + rounds: 15 + sims_per_round: 1000 + - arena: "Halite" + rounds: 15 + sims_per_round: 250 + - arena: "HuskyBench" + rounds: 15 + sims_per_round: 100 + - arena: "RoboCode" + rounds: 15 + sims_per_round: 250 + - arena: "RobotRumble" + rounds: 15 + sims_per_round: 250 + +phases: + - id: "phase_a_scaffold_screen" + status: "partially_blocked" + replications: 2 + goal: "Measure harness effects while holding the model fixed." + systems: + - id: "mini__gpt5mini" + agent: "mini" + model: "@openai/gpt-5-mini" + status: "ready_now" + - id: "swe_agent__gpt5mini" + agent: "swe-agent" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + - id: "openhands__gpt5mini" + agent: "openhands" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + - id: "codex_sdk__gpt5mini" + agent: "codex-sdk" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + run_count_formula: "4 harnesses x 3 opponents x 3 arenas x 2 reps" + run_count_total: 72 + success_rule: + shortlist_top_systems: 2 + require_no_major_regression_in: + - "simulation_validation_rate" + - "bash_success_rate" + - "recovery_after_failed_command" + + - id: "phase_b_codex_stack" + status: "blocked" + replications: 2 + goal: "Separate generic scaffold effects from Codex-specific model-stack co-design." + systems: + - id: "best_generic__gpt5mini" + agent: "TBD_from_phase_a" + model: "@openai/gpt-5-mini" + status: "blocked_on_phase_a" + - id: "best_generic__gpt5_1_codex_mini" + agent: "TBD_from_phase_a" + model: "@openai/gpt-5.1-codex-mini" + status: "blocked_on_phase_a_and_model_entry" + - id: "codex_sdk__gpt5_1_codex_mini" + agent: "codex-sdk" + model: "@openai/gpt-5.1-codex-mini" + status: "blocked_on_adapter_and_model_entry" + run_count_formula: "3 systems x 3 opponents x 3 arenas x 2 reps" + run_count_total: 54 + interpretation_checks: + - "Does codex-sdk help when the model is held fixed at gpt-5.1-codex-mini?" + - "Does the Codex-family model help even without the codex-sdk harness?" + + - id: "phase_c_confirm_gpt54" + status: "blocked" + replications: 1 + goal: "Confirm the best systems on a stronger expensive model." 
+ systems: + - id: "top1_from_phase_b__gpt54" + agent: "TBD_from_phase_b" + model: "@openai/gpt-5.4" + status: "blocked_on_phase_b_and_model_entry" + - id: "top2_from_phase_b__gpt54" + agent: "TBD_from_phase_b" + model: "@openai/gpt-5.4" + status: "blocked_on_phase_b_and_model_entry" + opponents: + - model: "@anthropic/claude-sonnet-4-5-20250929" + - model: "@openai/o3" + run_count_formula: "2 systems x 2 opponents x 6 arenas x 1 rep" + run_count_total: 24 + +metrics: + primary: + - "pooled_elo" + - "per_arena_elo" + - "head_to_head_win_rate_excluding_ties" + - "bootstrap_top1_consistency" + - "bootstrap_pairwise_order_agreement" + diagnostics: + - "bash_success_rate" + - "recovery_after_failed_command" + - "grounded_edit_rate" + - "simulation_validation_rate" + - "unit_test_validation_rate" + - "files_edited_per_round" + - "steps_per_round" + - "model_cost_usd" diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml new file mode 100644 index 00000000..c1df5b3e --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml @@ -0,0 +1,40 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board. + Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml new file mode 100644 index 00000000..def48781 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml @@ -0,0 +1,37 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+ CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..8640a350 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,48 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite. + Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust. + Example implementations can be found under the `airesources/` folder. + Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages. + Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language. + You may include additional files as needed, but please ensure: + 1. The `submission/` folder contains only files relevant to your bot. + 2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission). + 3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot. + + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `submission`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+ All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml new file mode 100644 index 00000000..53df4943 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml @@ -0,0 +1,39 @@ +tournament: + rounds: 15 +game: + name: HuskyBench + sims_per_round: 100 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called HuskyBench. + In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips. + Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively. + Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round. + You can use run_game.sh to check if your bot runs in time. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `client/player.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..ec1c8774 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,42 @@ +tournament: + rounds: 15 +game: + name: RoboCode + sims_per_round: 250 + args: + nodisplay: true + nosound: true + record_ratio: 0.2 +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RoboCode. + Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar. + Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots. + Your bot logic must be written in Java and located in the `robots/custom/` directory. + Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like. + + The game is played in 15 rounds.
For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robots/custom/`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..9663e660 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,39 @@ +tournament: + rounds: 15 +game: + name: RobotRumble + sims_per_round: 250 + args: + raw: true +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RobotRumble. + RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid. + Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match. + NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robot.js`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml new file mode 100644 index 00000000..c2d34dde --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml @@ -0,0 +1,36 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board.
+ Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml new file mode 100644 index 00000000..f2e539c7 --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml @@ -0,0 +1,33 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar. + CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..fc8f7eee --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,44 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite.
+ Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust. + Example implementations can be found under the `airesources/` folder. + Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages. + Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language. + You may include additional files as needed, but please ensure: + 1. The `submission/` folder contains only files relevant to your bot. + 2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission). + 3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot. + + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `submission`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml new file mode 100644 index 00000000..7ccd9220 --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml @@ -0,0 +1,35 @@ +tournament: + rounds: 15 +game: + name: HuskyBench + sims_per_round: 100 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called HuskyBench. + In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips. + Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively. + Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round. + You can use run_game.sh to check if your bot runs in time. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically.
+ + Your task: improve the bot in `client/player.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..72fdaf5c --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,38 @@ +tournament: + rounds: 15 +game: + name: RoboCode + sims_per_round: 250 + args: + nodisplay: true + nosound: true + record_ratio: 0.2 +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RoboCode. + Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar. + Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots. + Your bot logic must be written in Java and located in the `robots/custom/` directory. + Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robots/custom/`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..ce4ed43d --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,35 @@ +tournament: + rounds: 15 +game: + name: RobotRumble + sims_per_round: 250 + args: + raw: true +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RobotRumble. + RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+ Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match. + NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robot.js`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml new file mode 100644 index 00000000..7458df6c --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml @@ -0,0 +1,40 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board. + Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below).
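The `-default`/`-low`/`-medium`/`-high` player aliases used throughout these configs are what the new `model_display_name` helper (see the `codeclash/analysis/viz/utils.py` hunk earlier in this patch) turns into plot labels. A quick sketch of the expected behavior, assuming the bare base names (`gpt-5.4`, `gpt-5.3-codex`) have no entry of their own in `MODEL_TO_DISPLAY_NAME`:

```python
from codeclash.analysis.viz.utils import model_display_name

# A recognized tier suffix is split off and prettified; unknown base names pass through.
assert model_display_name("gpt-5.4-high") == "gpt-5.4 (High)"
assert model_display_name("gpt-5.3-codex-default") == "gpt-5.3-codex (Default)"
# Names without a tier suffix go through the lookup table unchanged (e.g. "o3").
assert model_display_name("o3") == "o3"
```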
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml new file mode 100644 index 00000000..3b856293 --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml @@ -0,0 +1,37 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar. + CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml new file mode 100644 index 00000000..ae84473f --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml @@ -0,0 +1,48 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite. + Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml
new file mode 100644
index 00000000..c057921f
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
new file mode 100644
index 00000000..b52edef8
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
new file mode 100644
index 00000000..f88367f1
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
new file mode 100644
index 00000000..63dee234
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
new file mode 100644
index 00000000..73ea79a6
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..d71a6bbc
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml
new file mode 100644
index 00000000..e3b67725
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..534db3b2
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..12a98655
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
new file mode 100644
index 00000000..3f3a3de3
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
new file mode 100644
index 00000000..6ad0b786
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..b55e7a00
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml
new file mode 100644
index 00000000..4bed924f
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..51ae0878
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..5ceed406
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml
new file mode 100644
index 00000000..04d2e83e
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml
new file mode 100644
index 00000000..50a28315
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..dff0c56f
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml b/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml
new file mode 100644
index 00000000..cf4cd1c1
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..b07ab7b6
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..b6eef98a
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/docs/openai_feedback_20260310.md b/docs/openai_feedback_20260310.md
new file mode 100644
index 00000000..b7ce8ca0
--- /dev/null
+++ b/docs/openai_feedback_20260310.md
@@ -0,0 +1,94 @@
+# OpenAI Benchmark Feedback
+
+Date: March 10, 2026
+
+## What We Ran
+
+We evaluated OpenAI coding models in a long-horizon iterative coding benchmark across six arenas:
+
+- BattleSnake
+- CoreWar
+- Halite
+- HuskyBench
+- RoboCode
+- RobotRumble
+
+Each matchup ran for 15 edit-and-play rounds. This is not a one-shot code generation benchmark; the models repeatedly edited their agents over time using feedback from prior rounds.
+
+## Sweep 1: Prior Pooled Sweep
+
+Run root:
+`/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312`
+
+Shareable plot:
+`/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312/analysis/elo/openai_feedback_per_arena.png`
+
+Overall pooled Elo:
+
+- GPT-5.4: 1298 ± 98
+- GPT-5: 1210 ± 65
+- GPT-5.3-Codex: 1092 ± 108
+
+Interpretation:
+
+- GPT-5.4 was the strongest overall model in the pooled sweep.
+- GPT-5 was generally in the middle.
+- GPT-5.3-Codex trailed overall.
+
+Important nuance:
+
+- GPT-5 still looked stronger in some arenas, especially Halite and HuskyBench.
+- GPT-5.4 comes out ahead overall because it was consistently strong across the full suite, especially in BattleSnake and in the aggregate ranking.
+
+## Sweep 2: Direct GPT-5.4 vs GPT-5.3-Codex by Reasoning Mode
+
+Run root:
+`/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105`
+
+Shareable plots:
+
+- Direct win rate by reasoning mode:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/shareable/reasoning_mode_win_rate.png`
+- GPT-5.4 win rate by arena and reasoning mode:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/shareable/reasoning_mode_arena_heatmap.png`
+- 8-variant Elo view:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/elo/all_games_elo.png`
+
+Matched-tier direct head-to-head results (win counts are decisive games; tie counts listed for reference):
+
+- Default: GPT-5.4 won 57, GPT-5.3-Codex won 37, ties 2
+- Low: GPT-5.4 won 48, GPT-5.3-Codex won 45, ties 3
+- Medium: GPT-5.4 won 81, GPT-5.3-Codex won 14, ties 1
+- High: GPT-5.4 won 62, GPT-5.3-Codex won 22, ties 12
+
+Interpretation:
+
+- GPT-5.4 beat GPT-5.3-Codex at every matched reasoning tier in this direct sweep.
+- Medium was the strongest GPT-5.4 setting in this benchmark.
+- High was also strong, but not clearly better than Medium.
+- Low was close to parity.
+
+Arena-level pattern:
+
+- GPT-5.4 was very strong in RoboCode across all tiers.
+- GPT-5.4 Medium was especially strong in CoreWar, Halite, HuskyBench, RoboCode, and RobotRumble.
+- GPT-5.3-Codex remained competitive or better in some BattleSnake settings.
+
+## Caveat on the 8-Variant Elo
+
+The 8-way Elo chart for the reasoning sweep should be treated as directional, not definitive.
+
+Reasons:
+
+- the sweep only included same-tier direct pairings
+- the comparison graph is split into four disconnected components
+- there are no bridge matches between tiers
+
+So the most reliable conclusion from Sweep 2 is the direct same-tier head-to-head result, not the exact cross-tier Elo spacing among all eight variants.
+
+## Suggested Product Feedback
+
+- GPT-5.4 looks stronger than GPT-5.3-Codex on long-horizon iterative code improvement, especially at Medium reasoning.
+- More reasoning did not monotonically improve results; Medium outperformed High in this setup.
+- Performance remains arena-dependent. GPT-5 still looked strong in some environments in the pooled sweep, so model choice may depend on task structure rather than only aggregate Elo.
+- For future benchmarking, a connected round-robin across reasoning settings would produce a more trustworthy shared Elo ladder.
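+
+## Appendix: Sanity-Checking the Head-to-Head Margins
+
+A quick back-of-envelope check on the matched-tier counts above. This is a minimal sketch using a normal-approximation confidence interval on the decisive-game win rate; it is illustrative only and not part of the eval pipeline:
+
+```python
+from math import sqrt
+
+# Decisive (non-tie) game counts from Sweep 2: (GPT-5.4 wins, GPT-5.3-Codex wins)
+results = {"default": (57, 37), "low": (48, 45), "medium": (81, 14), "high": (62, 22)}
+
+for tier, (wins_54, wins_codex) in results.items():
+    n = wins_54 + wins_codex
+    p = wins_54 / n
+    half_width = 1.96 * sqrt(p * (1 - p) / n)  # ~95% CI via normal approximation
+    print(f"{tier:>7}: GPT-5.4 {100 * p:.1f}% +/- {100 * half_width:.1f}pp over {n} decisive games")
+```
+
+On these counts the Low tier's interval comfortably spans 50%, which is why we describe it as close to parity, while the Medium and High intervals do not.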
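+
+## Appendix: Checking Pairing-Graph Connectivity
+
+The connectivity caveat can be verified mechanically. The sketch below assumes the same `metadata.json` layout that `scripts/plot_reasoning_head_to_head.py` reads; `pairing_components` is a hypothetical helper written for illustration, not part of the repo:
+
+```python
+import json
+from itertools import combinations
+from pathlib import Path
+
+
+def pairing_components(run_root: Path) -> list[set[str]]:
+    """Group player names into connected components of the pairing graph."""
+    parent: dict[str, str] = {}
+
+    def find(node: str) -> str:
+        parent.setdefault(node, node)
+        while parent[node] != node:
+            parent[node] = parent[parent[node]]  # path halving
+            node = parent[node]
+        return node
+
+    for meta in sorted(run_root.rglob("metadata.json")):
+        players = [
+            p.get("name")
+            for p in json.loads(meta.read_text()).get("config", {}).get("players", [])
+            if isinstance(p, dict)
+        ]
+        for a, b in combinations(players, 2):
+            parent[find(a)] = find(b)  # union the two players' components
+
+    groups: dict[str, set[str]] = {}
+    for node in parent:
+        groups.setdefault(find(node), set()).add(node)
+    return list(groups.values())
+
+
+# Four components (one per reasoning tier) means the data cannot pin down
+# relative Elo between variants that never played each other.
+```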
diff --git a/scripts/finalize_openai_sweep_report.sh b/scripts/finalize_openai_sweep_report.sh
new file mode 100755
index 00000000..9743c6e7
--- /dev/null
+++ b/scripts/finalize_openai_sweep_report.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+set -euo pipefail
+RUN_ROOT="/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312"
+REPORT="$RUN_ROOT/analysis/openai_vs_previous_leaderboard_summary.md"
+
+uv run python - <<'PY'
+import json, time
+from pathlib import Path
+run=Path('/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312')
+patterns=[
+    'PvpTournament.HuskyBench.r15.s100.p2.gpt-5.gpt-5.4.gpt-5.4-vs-gpt-5.*',
+    'PvpTournament.HuskyBench.r15.s100.p2.gpt-5.gpt-5.3-codex.gpt-5.3-codex-vs-gpt-5.*',
+]
+need={str(i) for i in range(16)}
+for _ in range(720):
+    done=0
+    for pat in patterns:
+        ds=sorted(run.glob(pat), key=lambda p:p.stat().st_mtime, reverse=True)
+        if not ds: continue
+        m=ds[0]/'metadata.json'
+        if not m.exists(): continue
+        try:
+            j=json.loads(m.read_text())
+        except (OSError, json.JSONDecodeError):
+            continue
+        rs=set(j.get('round_stats',{}).keys())
+        if need.issubset(rs): done+=1
+    if done==2:
+        print('huskybench complete')
+        break
+    time.sleep(30)
+else:
+    raise SystemExit('timeout waiting for huskybench completion')
+PY
+
+/Users/muhtasham/Documents/CodeClash/scripts/run_eval_pipeline.sh --log-dir "$RUN_ROOT"
+
+uv run python - <<'PY'
+from pathlib import Path
+run=Path('/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312')
+report=run/'analysis'/'openai_vs_previous_leaderboard_summary.md'
+report.parent.mkdir(parents=True, exist_ok=True)
+all_tex=run/'analysis'/'elo'/'elo_table_plain.tex'
+text=all_tex.read_text() if all_tex.exists() else ''
+rows=[]
+for line in text.splitlines():
+    if '&' in line and '\\\\' in line and 'Model' not in line:
+        parts=[p.strip() for p in line.replace('\\\\','').split('&')]
+        if len(parts)>=3:
+            rows.append(parts)
+
+prev={
+'Claude Sonnet 4.5':1385,'GPT-5':1366,'o3':1343,'Claude Sonnet 4':1224,
+'GPT-5 Mini':1199,'Gemini 2.5 Pro':1124,'Grok Code Fast':1006,'Qwen3 Coder':952,
+}
+
+lines=[]
+lines.append('# OpenAI Sweep vs Previous Leaderboard')
+lines.append('')
+lines.append('Current run root: `/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312`')
+lines.append('')
+lines.append('## New-model ALL Elo (this run)')
+if rows:
+    lines.append('| Model | Elo |')
+    lines.append('|---|---:|')
+    for r in rows:
+        model=r[0]
+        elo=r[1] if len(r)>1 else ''
+        if any(k in model for k in ['gpt-5.4','gpt-5.3-codex','gpt-5']):
+            lines.append(f'| {model} | {elo} |')
+else:
+    lines.append('_Could not parse elo_table_plain.tex_')
+
+lines.append('')
+lines.append('## Comparison to previous public leaderboard (ALL Elo)')
+# reference table of previous public leaderboard Elos
+lines.append('| Model | Elo |')
+lines.append('|---|---:|')
+for model, elo in prev.items():
+    lines.append(f'| {model} | {elo} |')
+lines.append('')
+lines.append('- Previous top baseline in your provided table: **Claude Sonnet 4.5 = 1385 ± 18**')
+lines.append('- Previous **GPT-5 = 1366 ± 17**')
+lines.append('')
+lines.append('Interpretation guidance: this run uses a much smaller model set (gpt-5, gpt-5.4, gpt-5.3-codex), so Elo scale can shift. Compare directionally, not as strict absolute replacement of global board ranks.')
+
+report.write_text('\n'.join(lines)+'\n')
+print(report)
+PY
diff --git a/scripts/plot_leaderboard_comparison.py b/scripts/plot_leaderboard_comparison.py
new file mode 100644
index 00000000..038d33cb
--- /dev/null
+++ b/scripts/plot_leaderboard_comparison.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+
+from codeclash.analysis.viz.utils import FONT_BOLD, FONT_REG, MARKERS, MODEL_TO_COLOR, model_display_name
+
+
+GAME_ORDER = ["halite", "huskybench", "corewar", "robotrumble", "robocode", "battlesnake", "all"]
+GAME_LABELS = {
+    "halite": "Halite",
+    "huskybench": "HuskyBench",
+    "corewar": "CoreWar",
+    "robotrumble": "RobotRumble",
+    "robocode": "RoboCode",
+    "battlesnake": "BattleSnake",
+    "all": "Overall",
+}
+
+
+def canonical_model_name(name: str) -> str:
+    return name.lower()
+
+
+def base_model_name(name: str) -> str:
+    for suffix in ("-default", "-low", "-medium", "-high"):
+        if name.endswith(suffix):
+            return name[: -len(suffix)]
+    return name
+
+
+def load_rows(leaderboards_path: Path) -> tuple[list[str], dict[str, dict[str, tuple[int, int]]]]:
+    raw = json.loads(leaderboards_path.read_text())
+    models: list[str] = []
+    rows: dict[str, dict[str, tuple[int, int]]] = {}
+
+    for game_key in GAME_ORDER:
+        board = raw.get(game_key, {}).get("board", [])
+        rows[game_key] = {}
+        for entry in board:
+            model = canonical_model_name(entry["model"])
+            if model not in models:
+                models.append(model)
+            rows[game_key][model] = (int(entry["elo"]), int(entry["elo_std"]))
+    return models, rows
+
+
+def plot(rows: dict[str, dict[str, tuple[int, int]]], models: list[str], output_base: Path, title: str) -> None:
+    fig, ax = plt.subplots(figsize=(11.5, 7.0))
+
+    y_positions = list(range(len(GAME_ORDER)))
+    spread = 0.48
+    if len(models) == 1:
+        offsets = [0.0]
+    else:
+        step = spread / (len(models) - 1)
+        offsets = [(-spread / 2) + (i * step) for i in range(len(models))]
+
+    for idx, model in enumerate(models):
+        ys = [y + offsets[idx] for y in y_positions]
+        # models missing from a board fall back to (0, 0), matching print_leaderboard_table.py
+        xs = [rows[g].get(model, (0, 0))[0] for g in GAME_ORDER]
+        xerr = [rows[g].get(model, (0, 0))[1] for g in GAME_ORDER]
+        color = MODEL_TO_COLOR.get(model, MODEL_TO_COLOR.get(base_model_name(model), "#4C78A8"))
+        marker = MARKERS[idx % len(MARKERS)]
+
+        ax.errorbar(
+            xs,
+            ys,
+            xerr=xerr,
+            fmt=marker,
+            color=color,
+            ecolor=color,
+            elinewidth=1.6,
+            capsize=3,
+            markersize=7,
+            label=model_display_name(model),
+        )
+
+    ax.set_yticks(y_positions)
+    ax.set_yticklabels([GAME_LABELS[g] for g in GAME_ORDER], fontproperties=FONT_REG, fontsize=12)
+    ax.invert_yaxis()
+    ax.set_xlabel("Elo", fontproperties=FONT_BOLD, fontsize=13)
+    ax.set_title(title, fontproperties=FONT_BOLD, fontsize=15, pad=12)
+    ax.grid(axis="x", alpha=0.25)
+    ax.axvline(1200, color="#888888", linestyle="--", linewidth=1, alpha=0.5)
+    ax.legend(frameon=False, prop=FONT_REG, loc="lower right", ncol=2)
+
+    for spine in ["top", "right"]:
+        ax.spines[spine].set_visible(False)
+
+    fig.tight_layout()
+    fig.savefig(output_base.with_suffix(".png"), dpi=220, bbox_inches="tight")
+    fig.savefig(output_base.with_suffix(".pdf"), bbox_inches="tight")
+    plt.close(fig)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Create a shareable per-game Elo comparison plot.")
+    parser.add_argument("leaderboards_json", type=Path)
parser.add_argument("--output-base", type=Path, required=True) + parser.add_argument("--title", type=str, default="OpenAI Benchmark Results by Arena") + args = parser.parse_args() + + models, rows = load_rows(args.leaderboards_json) + plot(rows, models, args.output_base, args.title) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_reasoning_head_to_head.py b/scripts/plot_reasoning_head_to_head.py new file mode 100644 index 00000000..d183c145 --- /dev/null +++ b/scripts/plot_reasoning_head_to_head.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +from codeclash.analysis.viz.utils import FONT_BOLD, FONT_REG + + +TIER_ORDER = ["default", "low", "medium", "high"] +ARENA_ORDER = ["BattleSnake", "CoreWar", "Halite", "HuskyBench", "RoboCode", "RobotRumble"] +ARENA_LABELS = { + "BattleSnake": "BattleSnake", + "CoreWar": "CoreWar", + "Halite": "Halite", + "HuskyBench": "HuskyBench", + "RoboCode": "RoboCode", + "RobotRumble": "RobotRumble", +} + + +def iter_live_metadata(run_root: Path): + for metadata_path in sorted(run_root.rglob("metadata.json")): + if "quarantine" in metadata_path.parts: + continue + yield metadata_path + + +def parse_tier(model_name: str) -> str: + for tier in TIER_ORDER: + if model_name.endswith(f"-{tier}"): + return tier + raise ValueError(f"Could not parse tier from {model_name}") + + +def load_stats(run_root: Path): + by_tier = {tier: {"gpt-5.4": 0, "gpt-5.3-codex": 0, "ties": 0, "total": 0} for tier in TIER_ORDER} + by_arena_tier = { + arena: {tier: {"gpt-5.4": 0, "gpt-5.3-codex": 0, "ties": 0, "total": 0} for tier in TIER_ORDER} + for arena in ARENA_ORDER + } + + for metadata_path in iter_live_metadata(run_root): + data = json.loads(metadata_path.read_text()) + game_name = data.get("config", {}).get("game", {}).get("name") or data.get("game", {}).get("name") + if game_name not in ARENA_ORDER: + continue + + players = [p.get("name") for p in data.get("config", {}).get("players", []) if isinstance(p, dict)] + if len(players) != 2: + continue + + tier = parse_tier(players[0]) + rounds = data.get("round_stats", {}) + if isinstance(rounds, dict): + rounds = list(rounds.values()) + + for round_stat in rounds: + winner = round_stat.get("winner") + bucket = by_tier[tier] + arena_bucket = by_arena_tier[game_name][tier] + if winner == "Tie": + bucket["ties"] += 1 + arena_bucket["ties"] += 1 + elif winner and winner.startswith("gpt-5.4"): + bucket["gpt-5.4"] += 1 + arena_bucket["gpt-5.4"] += 1 + elif winner and winner.startswith("gpt-5.3-codex"): + bucket["gpt-5.3-codex"] += 1 + arena_bucket["gpt-5.3-codex"] += 1 + else: + continue + bucket["total"] += 1 + arena_bucket["total"] += 1 + + return by_tier, by_arena_tier + + +def plot_overall(by_tier, output_path: Path) -> None: + tiers = TIER_ORDER + gpt54_rates = [] + codex_rates = [] + + for tier in tiers: + total = by_tier[tier]["total"] or 1 + gpt54_rates.append(100 * by_tier[tier]["gpt-5.4"] / total) + codex_rates.append(100 * by_tier[tier]["gpt-5.3-codex"] / total) + + y = np.arange(len(tiers)) + fig, ax = plt.subplots(figsize=(9, 4.8)) + + ax.barh(y - 0.18, gpt54_rates, height=0.34, color="#0B8F55", label="GPT-5.4") + ax.barh(y + 0.18, codex_rates, height=0.34, color="#C75B12", label="GPT-5.3-Codex") + + for idx, tier in enumerate(tiers): + total = by_tier[tier]["total"] + ax.text(gpt54_rates[idx] + 1.2, y[idx] - 0.18, 
f"{by_tier[tier]['gpt-5.4']}/{total}", va="center", fontproperties=FONT_REG, fontsize=10) + ax.text(codex_rates[idx] + 1.2, y[idx] + 0.18, f"{by_tier[tier]['gpt-5.3-codex']}/{total}", va="center", fontproperties=FONT_REG, fontsize=10) + + ax.set_yticks(y) + ax.set_yticklabels([tier.title() for tier in tiers], fontproperties=FONT_REG, fontsize=11) + ax.set_xlim(0, 100) + ax.set_xlabel("Win Rate Excluding Ties (%)", fontproperties=FONT_BOLD, fontsize=12) + ax.set_title("Direct Head-to-Head Win Rate by Reasoning Mode", fontproperties=FONT_BOLD, fontsize=14, pad=10) + ax.grid(axis="x", alpha=0.25) + ax.legend(frameon=False, prop=FONT_REG, loc="lower right") + for spine in ("top", "right"): + ax.spines[spine].set_visible(False) + + fig.tight_layout() + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def plot_heatmap(by_arena_tier, output_path: Path) -> None: + matrix = [] + annotations = [] + for arena in ARENA_ORDER: + row = [] + ann_row = [] + for tier in TIER_ORDER: + bucket = by_arena_tier[arena][tier] + total = bucket["total"] or 1 + rate = 100 * bucket["gpt-5.4"] / total + row.append(rate) + ann_row.append(f"{bucket['gpt-5.4']}/{total}") + matrix.append(row) + annotations.append(ann_row) + + matrix = np.array(matrix) + + fig, ax = plt.subplots(figsize=(8.8, 5.8)) + image = ax.imshow(matrix, cmap="RdYlGn", vmin=0, vmax=100, aspect="auto") + + ax.set_xticks(np.arange(len(TIER_ORDER))) + ax.set_xticklabels([tier.title() for tier in TIER_ORDER], fontproperties=FONT_REG, fontsize=11) + ax.set_yticks(np.arange(len(ARENA_ORDER))) + ax.set_yticklabels([ARENA_LABELS[a] for a in ARENA_ORDER], fontproperties=FONT_REG, fontsize=11) + ax.set_title("GPT-5.4 Win Rate by Arena and Reasoning Mode", fontproperties=FONT_BOLD, fontsize=14, pad=10) + + for row_idx in range(len(ARENA_ORDER)): + for col_idx in range(len(TIER_ORDER)): + value = matrix[row_idx, col_idx] + color = "#111111" if 25 <= value <= 75 else "#FFFFFF" + ax.text( + col_idx, + row_idx, + annotations[row_idx][col_idx], + ha="center", + va="center", + color=color, + fontproperties=FONT_REG, + fontsize=10, + ) + + cbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04) + cbar.set_label("GPT-5.4 Win Rate (%)", fontproperties=FONT_BOLD, fontsize=11) + + fig.tight_layout() + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def plot_per_game(by_arena_tier, output_path: Path) -> None: + fig, axes = plt.subplots(2, 3, figsize=(13.5, 8.2), sharex=True) + axes = axes.flatten() + y = np.arange(len(TIER_ORDER)) + + for ax, arena in zip(axes, ARENA_ORDER): + gpt54_rates = [] + codex_rates = [] + for tier in TIER_ORDER: + bucket = by_arena_tier[arena][tier] + total = bucket["total"] or 1 + gpt54_rates.append(100 * bucket["gpt-5.4"] / total) + codex_rates.append(100 * bucket["gpt-5.3-codex"] / total) + + ax.barh(y - 0.18, gpt54_rates, height=0.34, color="#0B8F55", label="GPT-5.4") + ax.barh(y + 0.18, codex_rates, height=0.34, color="#C75B12", label="GPT-5.3-Codex") + + for idx, tier in enumerate(TIER_ORDER): + total = by_arena_tier[arena][tier]["total"] + ax.text(gpt54_rates[idx] + 1.0, y[idx] - 0.18, f"{by_arena_tier[arena][tier]['gpt-5.4']}/{total}", va="center", fontproperties=FONT_REG, fontsize=8.5) + ax.text(codex_rates[idx] + 1.0, y[idx] + 0.18, f"{by_arena_tier[arena][tier]['gpt-5.3-codex']}/{total}", va="center", 
fontproperties=FONT_REG, fontsize=8.5) + + ax.set_title(ARENA_LABELS[arena], fontproperties=FONT_BOLD, fontsize=12, pad=8) + ax.set_xlim(0, 100) + ax.set_yticks(y) + ax.set_yticklabels([tier.title() for tier in TIER_ORDER], fontproperties=FONT_REG, fontsize=10) + ax.grid(axis="x", alpha=0.2) + for spine in ("top", "right"): + ax.spines[spine].set_visible(False) + + handles, labels = axes[0].get_legend_handles_labels() + fig.legend(handles, labels, frameon=False, prop=FONT_REG, loc="lower center", ncol=2, bbox_to_anchor=(0.5, -0.01)) + fig.suptitle("Direct Head-to-Head Win Rate by Arena and Reasoning Mode", fontproperties=FONT_BOLD, fontsize=15, y=0.98) + fig.supxlabel("Win Rate Excluding Ties (%)", fontproperties=FONT_BOLD, fontsize=12, y=0.04) + fig.tight_layout(rect=(0, 0.05, 1, 0.95)) + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Plot direct head-to-head reasoning-mode results from tournament metadata.") + parser.add_argument("run_root", type=Path) + parser.add_argument("--output-dir", type=Path, required=True) + args = parser.parse_args() + + args.output_dir.mkdir(parents=True, exist_ok=True) + by_tier, by_arena_tier = load_stats(args.run_root) + plot_overall(by_tier, args.output_dir / "reasoning_mode_win_rate") + plot_heatmap(by_arena_tier, args.output_dir / "reasoning_mode_arena_heatmap") + plot_per_game(by_arena_tier, args.output_dir / "reasoning_mode_win_rate_per_game") + + +if __name__ == "__main__": + main() diff --git a/scripts/print_leaderboard_table.py b/scripts/print_leaderboard_table.py new file mode 100644 index 00000000..9108d336 --- /dev/null +++ b/scripts/print_leaderboard_table.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import argparse +import json +from pathlib import Path + + +GAME_ORDER = ["halite", "huskybench", "corewar", "robotrumble", "robocode", "battlesnake", "all"] + + +def load_board(path: Path) -> dict: + with path.open() as f: + return json.load(f) + + +def as_lookup(board: list[dict]) -> dict[str, tuple[int, int]]: + return {row["model"]: (int(row["elo"]), int(row["elo_std"])) for row in board} + + +def main(input: Path, out: Path) -> None: + data = load_board(input) + models = {row["model"] for section in data.values() for row in section.get("board", [])} + + rows = [] + for model in sorted(models): + all_board = data.get("all", {}).get("board", []) + rank = next((int(r["rank"]) for r in all_board if r["model"] == model), 999) + + vals = [] + for game in GAME_ORDER: + lookup = as_lookup(data.get(game, {}).get("board", [])) + elo, std = lookup.get(model, (0, 0)) + vals.append(f"{elo} ± {std}") + rows.append((rank, model, vals)) + + rows.sort(key=lambda x: x[0]) + + header = "| Rank | Model | Halite | HuskyBench | CoreWar | RobotRumble | Robocode | BattleSnake | All |\n" + header += "|---:|---|---:|---:|---:|---:|---:|---:|---:|\n" + lines = [header] + for rank, model, vals in rows: + lines.append( + f"| {rank} | {model} | {vals[0]} | {vals[1]} | {vals[2]} | {vals[3]} | {vals[4]} | {vals[5]} | {vals[6]} |\n" + ) + + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text("".join(lines)) + print(f"Wrote {out}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=Path, required=True) + parser.add_argument("--out", type=Path, required=True) + args = parser.parse_args() + main(args.input, args.out) diff --git 
a/scripts/run_eval_pipeline.sh b/scripts/run_eval_pipeline.sh new file mode 100755 index 00000000..2905b017 --- /dev/null +++ b/scripts/run_eval_pipeline.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + scripts/run_eval_pipeline.sh --log-dir [--output-dir ] [--viewer] + +Description: + Runs post-benchmark analysis pipeline for CodeClash logs: + 1) backfill cost info into metadata + 2) compute win-rate summary + 3) compute Elo rankings + uncertainty outputs + 4) generate win-rate heatmap PDF + 5) render markdown leaderboard table for manual patching + 6) optionally launch local viewer + +Arguments: + --log-dir Required. Root directory containing tournament logs. + --output-dir Optional. Defaults to /analysis. + --viewer Optional. Launch viewer at end (blocks until Ctrl+C). + -h, --help Show this help. +EOF +} + +LOG_DIR="" +OUTPUT_DIR="" +OPEN_VIEWER=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --log-dir) + LOG_DIR="${2:-}" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${LOG_DIR}" ]]; then + echo "Error: --log-dir is required." >&2 + usage + exit 1 +fi + +if [[ ! -d "${LOG_DIR}" ]]; then + echo "Error: log directory does not exist: ${LOG_DIR}" >&2 + exit 1 +fi + +if [[ -z "${OUTPUT_DIR}" ]]; then + OUTPUT_DIR="${LOG_DIR%/}/analysis" +fi + +ELO_OUT="${OUTPUT_DIR%/}/elo" +HEATMAP_OUT="${OUTPUT_DIR%/}/heatmap_win_rates.pdf" +TABLE_OUT="${OUTPUT_DIR%/}/leaderboard_table.md" + +mkdir -p "${ELO_OUT}" + +echo "==> Repo root: ${REPO_ROOT}" +echo "==> Log dir: ${LOG_DIR}" +echo "==> Output dir: ${OUTPUT_DIR}" + +cd "${REPO_ROOT}" + +echo "==> Step 1/5: Backfilling cost info into metadata..." +uv run python "${REPO_ROOT}/scripts/include_cost_info_in_metadata.py" "${LOG_DIR}" + +echo "==> Step 2/5: Computing win-rate summary..." +uv run python "${REPO_ROOT}/codeclash/analysis/metrics/win_rate.py" -d "${LOG_DIR}" + +echo "==> Step 3/5: Computing Elo rankings..." +uv run python "${REPO_ROOT}/codeclash/analysis/metrics/elo.py" \ + -d "${LOG_DIR}" \ + --output-dir "${ELO_OUT}" + +echo "==> Step 4/5: Generating win-rate heatmap..." +uv run python "${REPO_ROOT}/codeclash/analysis/viz/heatmap_win_rates.py" \ + -d "${LOG_DIR}" \ + -o "${HEATMAP_OUT}" + +echo "==> Step 5/5: Rendering leaderboard table..." +uv run python "${REPO_ROOT}/scripts/print_leaderboard_table.py" \ + --input "${ELO_OUT}/leaderboards.json" \ + --out "${TABLE_OUT}" + +echo +echo "Pipeline complete." +echo "Elo outputs: ${ELO_OUT}" +echo "Heatmap: ${HEATMAP_OUT}" +echo "Leaderboard table: ${TABLE_OUT}" + +if [[ ${OPEN_VIEWER} -eq 1 ]]; then + echo "==> Launching viewer (Ctrl+C to stop)..." + uv run python "${REPO_ROOT}/scripts/run_viewer.py" -d "${LOG_DIR}" +else + echo "To inspect trajectories: uv run python ${REPO_ROOT}/scripts/run_viewer.py -d ${LOG_DIR}" +fi diff --git a/scripts/run_gpt54_gpt53codex_round_robin.sh b/scripts/run_gpt54_gpt53codex_round_robin.sh new file mode 100755 index 00000000..1dfaef80 --- /dev/null +++ b/scripts/run_gpt54_gpt53codex_round_robin.sh @@ -0,0 +1,312 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" + +RUN_ROOT="${REPO_ROOT}/logs/gpt54_gpt53codex_round_robin_$(date +%Y%m%d_%H%M%S)" +MAX_CONFIG_RETRIES=2 +CONTINUE_ON_ERROR=0 +OPEN_VIEWER=0 +RESUME=0 +DRY_RUN=0 +PARALLEL=0 +MAX_PARALLEL=4 + +usage() { + cat <<'EOF' +Usage: + scripts/run_gpt54_gpt53codex_round_robin.sh [options] + +Description: + Runs a full round robin across 8 variants: + - gpt-5.4-default + - gpt-5.4-low + - gpt-5.4-medium + - gpt-5.4-high + - gpt-5.3-codex-default + - gpt-5.3-codex-low + - gpt-5.3-codex-medium + - gpt-5.3-codex-high + + This creates one connected match graph so Elo is meaningful across all 8 variants. + +Options: + --run-root Set custom logs root for this batch. + --max-config-retries Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue after a failed pairing. + --resume Skip already-completed per-arena configs in an existing --run-root. + --parallel Run pairings in parallel. + --max-parallel Maximum concurrent pairings when --parallel is set (default: 4). + --viewer Launch viewer after the eval pipeline. + --dry-run Print commands without running them. + -h, --help Show help. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --parallel) + PARALLEL=1 + shift + ;; + --max-parallel) + MAX_PARALLEL="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || [[ "${MAX_PARALLEL}" -lt 1 ]]; then + echo "Error: --max-parallel must be a positive integer, got '${MAX_PARALLEL}'" >&2 + exit 1 +fi + +if [[ ! -x "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" ]]; then + echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" >&2 + exit 1 +fi + +if [[ ! 
-x "${REPO_ROOT}/scripts/run_eval_pipeline.sh" ]]; then + echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_eval_pipeline.sh" >&2 + exit 1 +fi + +mkdir -p "${RUN_ROOT}" + +declare -a VARIANT_KEYS=( + "gpt-5.4-default" + "gpt-5.4-low" + "gpt-5.4-medium" + "gpt-5.4-high" + "gpt-5.3-codex-default" + "gpt-5.3-codex-low" + "gpt-5.3-codex-medium" + "gpt-5.3-codex-high" +) + +variant_model() { + case "$1" in + gpt-5.4-*) echo "openai/gpt-5.4" ;; + gpt-5.3-codex-*) echo "openai/gpt-5.3-codex" ;; + *) echo "Unknown variant: $1" >&2; exit 1 ;; + esac +} + +variant_effort() { + case "$1" in + *-default) echo "" ;; + *-low) echo "low" ;; + *-medium) echo "medium" ;; + *-high) echo "high" ;; + *) echo "Unknown variant: $1" >&2; exit 1 ;; + esac +} + +common_args=( + --log-dir "${RUN_ROOT}" + --max-config-retries "${MAX_CONFIG_RETRIES}" +) + +if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then + common_args+=(--continue-on-error) +fi + +if [[ ${RESUME} -eq 1 ]]; then + common_args+=(--resume) +fi + +run_pairing() { + local player_alias="$1" + local opponent_alias="$2" + local player_model + local opponent_model + local player_effort + local opponent_effort + local -a args + + player_model="$(variant_model "${player_alias}")" + opponent_model="$(variant_model "${opponent_alias}")" + player_effort="$(variant_effort "${player_alias}")" + opponent_effort="$(variant_effort "${opponent_alias}")" + + args=( + "${common_args[@]}" + --model "${player_model}" + --alias "${player_alias}" + --opponent "${opponent_model}" + --opponent-alias "${opponent_alias}" + ) + + if [[ -n "${player_effort}" ]]; then + args+=(--player-reasoning-effort "${player_effort}") + fi + if [[ -n "${opponent_effort}" ]]; then + args+=(--opponent-reasoning-effort "${opponent_effort}") + fi + + echo "==> Pairing: ${player_alias} vs ${opponent_alias}" + "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" "${args[@]}" +} + +print_pairing_command() { + local player_alias="$1" + local opponent_alias="$2" + local player_model + local opponent_model + local player_effort + local opponent_effort + + player_model="$(variant_model "${player_alias}")" + opponent_model="$(variant_model "${opponent_alias}")" + player_effort="$(variant_effort "${player_alias}")" + opponent_effort="$(variant_effort "${opponent_alias}")" + + printf "%s --model %s --alias %s --opponent %s --opponent-alias %s --log-dir %s --max-config-retries %s" \ + "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" \ + "${player_model}" \ + "${player_alias}" \ + "${opponent_model}" \ + "${opponent_alias}" \ + "${RUN_ROOT}" \ + "${MAX_CONFIG_RETRIES}" + + if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then + printf " --continue-on-error" + fi + if [[ ${RESUME} -eq 1 ]]; then + printf " --resume" + fi + if [[ -n "${player_effort}" ]]; then + printf " --player-reasoning-effort %s" "${player_effort}" + fi + if [[ -n "${opponent_effort}" ]]; then + printf " --opponent-reasoning-effort %s" "${opponent_effort}" + fi + printf "\n" +} + +declare -a PAIRS=() +for ((i = 0; i < ${#VARIANT_KEYS[@]}; i++)); do + for ((j = i + 1; j < ${#VARIANT_KEYS[@]}; j++)); do + PAIRS+=("${VARIANT_KEYS[i]}|${VARIANT_KEYS[j]}") + done +done + +echo "==> Repo root: ${REPO_ROOT}" +echo "==> Run root: ${RUN_ROOT}" +echo "==> Variants: ${#VARIANT_KEYS[@]}" +echo "==> Pairings: ${#PAIRS[@]}" +echo "==> Parallel pairings: ${PARALLEL}" +echo "==> Max parallel pairings: ${MAX_PARALLEL}" +echo "==> Continue on error: ${CONTINUE_ON_ERROR}" +echo "==> Resume: ${RESUME}" +echo "==> Max config retries: ${MAX_CONFIG_RETRIES}" + +if [[ 
${DRY_RUN} -eq 1 ]]; then
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    print_pairing_command "${player_alias}" "${opponent_alias}"
+  done
+  if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT} --viewer"
+  else
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT}"
+  fi
+  exit 0
+fi
+
+if [[ ${PARALLEL} -eq 1 ]]; then
+  declare -a PIDS=()
+  declare -a PID_PAIRS=()
+  declare -i FAILURE_COUNT=0
+
+  wait_for_one() {
+    local pid="${PIDS[0]}"
+    local pair="${PID_PAIRS[0]}"
+    local status=0
+
+    # Capture the real exit status: with `if ! wait ...`, $? reflects the
+    # negation (always 0), so a failed pairing would propagate as success.
+    if wait "${pid}"; then
+      echo "==> Pairing finished: ${pair}"
+    else
+      status=$?
+      echo "==> Pairing failed: ${pair} (exit ${status})" >&2
+      FAILURE_COUNT+=1
+      if [[ ${CONTINUE_ON_ERROR} -ne 1 ]]; then
+        echo "==> Stopping due to pairing failure and --continue-on-error not set." >&2
+        exit "${status}"
+      fi
+    fi
+
+    PIDS=("${PIDS[@]:1}")
+    PID_PAIRS=("${PID_PAIRS[@]:1}")
+  }
+
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    run_pairing "${player_alias}" "${opponent_alias}" &
+    PIDS+=("$!")
+    PID_PAIRS+=("${player_alias} vs ${opponent_alias}")
+    if [[ ${#PIDS[@]} -ge ${MAX_PARALLEL} ]]; then
+      wait_for_one
+    fi
+  done
+
+  while [[ ${#PIDS[@]} -gt 0 ]]; do
+    wait_for_one
+  done
+
+  if [[ ${FAILURE_COUNT} -gt 0 ]]; then
+    echo "==> ${FAILURE_COUNT} pairing(s) failed." >&2
+  fi
+else
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    run_pairing "${player_alias}" "${opponent_alias}"
+  done
+fi
+
+if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}" --viewer
+else
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}"
+fi
diff --git a/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh b/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh
new file mode 100755
index 00000000..a94eff86
--- /dev/null
+++ b/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+REPO_ROOT="/Users/muhtasham/Documents/CodeClash"
+RUN_ROOT="${REPO_ROOT}/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105"
+CFG_ROOT="${REPO_ROOT}/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high"
+RUN_SUFFIX="gpt-5.4-high-vs-gpt-5.3-codex-high"
+
+cd "${REPO_ROOT}"
+
+# Force key source to repo .env so mini-swe-agent global env doesn't silently override.
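+# (dotenv_values reads the file directly and never mutates os.environ, so the
+# value extracted below is exactly what sits in .env, not an inherited export.)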
+OPENAI_KEY_FROM_REPO="$( + uv run python - "${REPO_ROOT}/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "${OPENAI_KEY_FROM_REPO}" ]]; then + echo "Error: OPENAI_API_KEY missing in ${REPO_ROOT}/.env" >&2 + exit 1 +fi + +export OPENAI_API_KEY="${OPENAI_KEY_FROM_REPO}" +unset OPENAI_KEY_FROM_REPO + +uv run python - "${REPO_ROOT}/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f"==> OPENAI key source forced: {repo_env}") +print(f"==> OPENAI key fingerprint: {fp(env_key)}") +if mini_key and mini_key != repo_key: + print("==> note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +mkdir -p "${RUN_ROOT}/quarantine" + +find "${RUN_ROOT}" -maxdepth 1 -type d \ + -name 'PvpTournament.Halite.r15.s250.p2.gpt-5.3-codex-high.gpt-5.4-high.gpt-5.4-high-vs-gpt-5.3-codex-high.*' \ + -exec mv {} "${RUN_ROOT}/quarantine"/ \; + +uv run python main.py \ + "${CFG_ROOT}/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" diff --git a/scripts/run_gpt54_vs_gpt53codex_reasoning.sh b/scripts/run_gpt54_vs_gpt53codex_reasoning.sh new file mode 100755 index 00000000..962eefeb --- /dev/null +++ b/scripts/run_gpt54_vs_gpt53codex_reasoning.sh @@ -0,0 +1,222 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +PLAYER_MODEL="openai/gpt-5.4" +OPPONENT_MODEL="openai/gpt-5.3-codex" +RUN_ROOT="${REPO_ROOT}/logs/gpt54_vs_gpt53codex_reasoning_$(date +%Y%m%d_%H%M%S)" +MAX_CONFIG_RETRIES=2 +CONTINUE_ON_ERROR=0 +OPEN_VIEWER=0 +RESUME=0 +DRY_RUN=0 +PARALLEL=0 +MAX_PARALLEL=4 + +usage() { + cat <<'EOF' +Usage: + scripts/run_gpt54_vs_gpt53codex_reasoning.sh [options] + +Description: + Runs direct head-to-head benchmark sweeps for openai/gpt-5.4 vs openai/gpt-5.3-codex + across four effort tiers in one shared log root: + - default + - low + - medium + - high + +Options: + --run-root Set custom logs root for this batch. + --max-config-retries Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue to remaining configs/tiers when one fails. + --resume Skip already-completed per-arena configs in an existing --run-root. + --parallel Run tiers in parallel. + --max-parallel Maximum concurrent tiers when --parallel is set (default: 4). + --viewer Launch viewer after the eval pipeline. 
+  --dry-run                  Print commands without running them.
+  -h, --help                 Show help.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --run-root)
+      RUN_ROOT="${2:-}"
+      shift 2
+      ;;
+    --max-config-retries)
+      MAX_CONFIG_RETRIES="${2:-}"
+      shift 2
+      ;;
+    --continue-on-error)
+      CONTINUE_ON_ERROR=1
+      shift
+      ;;
+    --resume)
+      RESUME=1
+      shift
+      ;;
+    --parallel)
+      PARALLEL=1
+      shift
+      ;;
+    --max-parallel)
+      MAX_PARALLEL="${2:-}"
+      shift 2
+      ;;
+    --viewer)
+      OPEN_VIEWER=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then
+  echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2
+  exit 1
+fi
+
+if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || [[ "${MAX_PARALLEL}" -lt 1 ]]; then
+  echo "Error: --max-parallel must be a positive integer, got '${MAX_PARALLEL}'" >&2
+  exit 1
+fi
+
+if [[ ! -x "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" ]]; then
+  echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" >&2
+  exit 1
+fi
+
+if [[ ! -x "${REPO_ROOT}/scripts/run_eval_pipeline.sh" ]]; then
+  echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_eval_pipeline.sh" >&2
+  exit 1
+fi
+
+mkdir -p "${RUN_ROOT}"
+
+common_args=(
+  --model "${PLAYER_MODEL}"
+  --opponent "${OPPONENT_MODEL}"
+  --log-dir "${RUN_ROOT}"
+  --max-config-retries "${MAX_CONFIG_RETRIES}"
+)
+
+if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then
+  common_args+=(--continue-on-error)
+fi
+
+if [[ ${RESUME} -eq 1 ]]; then
+  common_args+=(--resume)
+fi
+
+run_tier() {
+  local tier="$1"
+  local player_alias="gpt-5.4-${tier}"
+  local opponent_alias="gpt-5.3-codex-${tier}"
+  local -a args=("${common_args[@]}" --alias "${player_alias}" --opponent-alias "${opponent_alias}")
+
+  if [[ "${tier}" != "default" ]]; then
+    args+=(--player-reasoning-effort "${tier}" --opponent-reasoning-effort "${tier}")
+  fi
+
+  echo "==> Running tier: ${tier}"
+  "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" "${args[@]}"
+}
+
+echo "==> Repo root: ${REPO_ROOT}"
+echo "==> Run root: ${RUN_ROOT}"
+echo "==> Player model: ${PLAYER_MODEL}"
+echo "==> Opponent model: ${OPPONENT_MODEL}"
+echo "==> Tiers: default, low, medium, high"
+echo "==> Continue on error: ${CONTINUE_ON_ERROR}"
+echo "==> Resume: ${RESUME}"
+echo "==> Parallel tiers: ${PARALLEL}"
+echo "==> Max parallel tiers: ${MAX_PARALLEL}"
+echo "==> Max config retries: ${MAX_CONFIG_RETRIES}"
+
+if [[ ${DRY_RUN} -eq 1 ]]; then
+  for tier in default low medium high; do
+    if [[ "${tier}" == "default" ]]; then
+      echo "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh ${common_args[*]} --alias gpt-5.4-default --opponent-alias gpt-5.3-codex-default"
+    else
+      echo "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh ${common_args[*]} --alias gpt-5.4-${tier} --opponent-alias gpt-5.3-codex-${tier} --player-reasoning-effort ${tier} --opponent-reasoning-effort ${tier}"
+    fi
+  done
+  if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT} --viewer"
+  else
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT}"
+  fi
+  exit 0
+fi
+
+if [[ ${PARALLEL} -eq 1 ]]; then
+  declare -a PIDS=()
+  declare -a PID_TIERS=()
+  declare -i FAILURE_COUNT=0
+
+  wait_for_one() {
+    local pid="${PIDS[0]}"
+    local tier="${PID_TIERS[0]}"
+    local status=0
+
+    # Capture the real exit status: with `if ! wait ...`, $? reflects the
+    # negation (always 0), so a failed tier would previously exit 0.
+    wait "${pid}" || status=$?
+    if [[ ${status} -ne 0 ]]; then
+      echo "==> Tier failed: ${tier} (exit ${status})" >&2
+      FAILURE_COUNT+=1
+      if [[ ${CONTINUE_ON_ERROR} -ne 1 ]]; then
+        echo "==> Stopping due to tier failure and --continue-on-error not set." >&2
+        exit "${status}"
+      fi
+    else
+      echo "==> Tier finished: ${tier}"
+    fi
+
+    PIDS=("${PIDS[@]:1}")
+    PID_TIERS=("${PID_TIERS[@]:1}")
+  }
+
+  for tier in default low medium high; do
+    run_tier "${tier}" &
+    PIDS+=("$!")
+    PID_TIERS+=("${tier}")
+    if [[ ${#PIDS[@]} -ge ${MAX_PARALLEL} ]]; then
+      wait_for_one
+    fi
+  done
+
+  while [[ ${#PIDS[@]} -gt 0 ]]; do
+    wait_for_one
+  done
+
+  if [[ ${FAILURE_COUNT} -gt 0 ]]; then
+    echo "==> ${FAILURE_COUNT} tier(s) failed." >&2
+  fi
+else
+  run_tier default
+  run_tier low
+  run_tier medium
+  run_tier high
+fi
+
+if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}" --viewer
+else
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}"
+fi
diff --git a/scripts/run_openai_model_benchmarks.sh b/scripts/run_openai_model_benchmarks.sh
new file mode 100755
index 00000000..d258e655
--- /dev/null
+++ b/scripts/run_openai_model_benchmarks.sh
@@ -0,0 +1,498 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+usage() {
+  cat <<'EOF'
+Usage:
+  scripts/run_openai_model_benchmarks.sh --model <model-id> [options]
+
+Description:
+  Generates configs for one OpenAI model vs a configurable opponent across the standard
+  benchmark arenas, runs all tournaments, and optionally runs post-eval analysis.
+
+Required:
+  --model <model-id>         Example: openai/gpt-5.4-pro-2026-03-05
+
+Optional:
+  --alias <name>             Player/config alias (default: model basename).
+  --opponent <model-id>      Opponent model id (default: openai/gpt-5).
+  --opponent-alias <name>    Opponent display alias in config (default: opponent basename).
+  --reasoning-effort <level> Set reasoning_effort for both players (e.g. low|medium|high).
+  --player-reasoning-effort <level>
+                             Set reasoning_effort only for the evaluated model.
+  --opponent-reasoning-effort <level>
+                             Set reasoning_effort only for the opponent model.
+  --log-dir <dir>            Logs root output dir (default: logs/<alias>_vs_<opponent-alias>_<timestamp>).
+  --configs-dir <dir>        Generated configs dir (default: configs/generated).
+  --resume                   Skip configs that already have a completed run in --log-dir.
+  --max-config-retries <n>   Retry each failed config up to n times (default: 2).
+  --continue-on-error        Continue to remaining configs even if one fails.
+  --post-eval                Run scripts/run_eval_pipeline.sh after runs.
+  --viewer                   With --post-eval, also launch viewer at end.
+  --dry-run                  Generate configs + print run commands only.
+  -h, --help                 Show help.
+
+Notes:
+  - This script uses model_class: litellm for both players.
+  - It expects OPENAI_API_KEY (and usually GITHUB_TOKEN) in your environment.
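+  - Generated configs are written to <configs-dir>/<alias>_vs_<opponent-alias>/
+    and regenerated from the configs/main templates on every invocation.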
+EOF +} + +MODEL="" +ALIAS="" +OPPONENT="openai/gpt-5" +OPPONENT_ALIAS="" +REASONING_EFFORT="" +PLAYER_REASONING_EFFORT="" +OPPONENT_REASONING_EFFORT="" +LOG_DIR="" +CONFIGS_DIR="${REPO_ROOT}/configs/generated" +RUN_POST_EVAL=0 +OPEN_VIEWER=0 +DRY_RUN=0 +RESUME=0 +CONTINUE_ON_ERROR=0 +MAX_CONFIG_RETRIES=2 + +while [[ $# -gt 0 ]]; do + case "$1" in + --model) + MODEL="${2:-}" + shift 2 + ;; + --alias) + ALIAS="${2:-}" + shift 2 + ;; + --opponent) + OPPONENT="${2:-}" + shift 2 + ;; + --opponent-alias) + OPPONENT_ALIAS="${2:-}" + shift 2 + ;; + --reasoning-effort) + REASONING_EFFORT="${2:-}" + shift 2 + ;; + --player-reasoning-effort) + PLAYER_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --opponent-reasoning-effort) + OPPONENT_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --log-dir) + LOG_DIR="${2:-}" + shift 2 + ;; + --configs-dir) + CONFIGS_DIR="${2:-}" + shift 2 + ;; + --post-eval) + RUN_POST_EVAL=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${MODEL}" ]]; then + echo "Error: --model is required." >&2 + usage + exit 1 +fi + +if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +# Force key source to repo .env so mini-swe-agent global env doesn't silently override. +OPENAI_KEY_FROM_REPO="$( + uv run python - "${REPO_ROOT}/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "${OPENAI_KEY_FROM_REPO}" ]]; then + echo "Error: OPENAI_API_KEY missing in ${REPO_ROOT}/.env" >&2 + exit 1 +fi +export OPENAI_API_KEY="${OPENAI_KEY_FROM_REPO}" +unset OPENAI_KEY_FROM_REPO + +uv run python - "${REPO_ROOT}/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f"==> OPENAI key source forced: {repo_env}") +print(f"==> OPENAI key fingerprint: {fp(env_key)}") +if mini_key and mini_key != repo_key: + print("==> note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +normalize_model_id() { + local v="${1#@}" + if [[ "${v}" == */* ]]; then + echo "${v}" + else + echo "openai/${v}" + fi +} + +# Accept "@openai/gpt-5", "openai/gpt-5", and bare "gpt-5". 
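+# normalize_model_id (defined above) strips a leading "@" and prefixes bare
+# names with "openai/", so all three spellings resolve to the same LiteLLM id.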
+MODEL="$(normalize_model_id "${MODEL}")" +OPPONENT="$(normalize_model_id "${OPPONENT}")" + +if [[ -n "${REASONING_EFFORT}" ]]; then + if [[ -z "${PLAYER_REASONING_EFFORT}" ]]; then + PLAYER_REASONING_EFFORT="${REASONING_EFFORT}" + fi + if [[ -z "${OPPONENT_REASONING_EFFORT}" ]]; then + OPPONENT_REASONING_EFFORT="${REASONING_EFFORT}" + fi +fi + +requires_responses_model_class() { + local m="$1" + case "$m" in + openai/gpt-5.4-pro-2026-03-05) return 0 ;; + *) return 1 ;; + esac +} + +PLAYER_MODEL_CLASS="litellm" +OPPONENT_MODEL_CLASS="litellm" +RESPONSES_SANITIZED_CLASS="codeclash.agents.litellm_response_sanitized_model.LitellmResponseSanitizedModel" +responses_class_available=0 +if uv run python - "${RESPONSES_SANITIZED_CLASS}" <<'PY' >/dev/null 2>&1 +import importlib +import sys + +class_path = sys.argv[1] +module_name, class_name = class_path.rsplit(".", 1) +mod = importlib.import_module(module_name) +getattr(mod, class_name) +PY +then + responses_class_available=1 +fi +if requires_responses_model_class "${MODEL}"; then + if [[ ${responses_class_available} -eq 1 ]]; then + PLAYER_MODEL_CLASS="${RESPONSES_SANITIZED_CLASS}" + else + echo "Warning: requested Responses model class '${RESPONSES_SANITIZED_CLASS}' is not importable; falling back to 'litellm'." >&2 + fi +fi +if requires_responses_model_class "${OPPONENT}"; then + if [[ ${responses_class_available} -eq 1 ]]; then + OPPONENT_MODEL_CLASS="${RESPONSES_SANITIZED_CLASS}" + else + echo "Warning: requested Responses model class '${RESPONSES_SANITIZED_CLASS}' is not importable; falling back to 'litellm'." >&2 + fi +fi + +if [[ -z "${ALIAS}" ]]; then + ALIAS="${MODEL#openai/}" +fi + +if [[ -z "${OPPONENT_ALIAS}" ]]; then + OPPONENT_ALIAS="${OPPONENT#openai/}" +fi + +SAFE_ALIAS="${ALIAS//\//-}" +SAFE_ALIAS="${SAFE_ALIAS//@/}" +SAFE_OPPONENT_ALIAS="${OPPONENT_ALIAS//\//-}" +SAFE_OPPONENT_ALIAS="${SAFE_OPPONENT_ALIAS//@/}" +RUN_SUFFIX="${SAFE_ALIAS}-vs-${SAFE_OPPONENT_ALIAS}" + +if [[ -z "${LOG_DIR}" ]]; then + TS="$(date +%Y%m%d_%H%M%S)" + LOG_DIR="${REPO_ROOT}/logs/${SAFE_ALIAS}_vs_${SAFE_OPPONENT_ALIAS}_${TS}" +fi + +RUN_CONFIG_DIR="${CONFIGS_DIR%/}/${SAFE_ALIAS}_vs_${SAFE_OPPONENT_ALIAS}" +mkdir -p "${RUN_CONFIG_DIR}" "${LOG_DIR}" + +declare -a TEMPLATES=( + "${REPO_ROOT}/configs/main/BattleSnake__gpt-5__o3__r15__s1000.yaml" + "${REPO_ROOT}/configs/main/CoreWar__gpt-5__o3__r15__s1000.yaml" + "${REPO_ROOT}/configs/main/Halite__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/RoboCode__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/RobotRumble__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/HuskyBench__gpt-5__o3__r15__s100.yaml" +) + +for tpl in "${TEMPLATES[@]}"; do + if [[ ! 
-f "${tpl}" ]]; then + echo "Error: Missing template config: ${tpl}" >&2 + exit 1 + fi +done + +declare -a GENERATED_CONFIGS=() +for tpl in "${TEMPLATES[@]}"; do + base_name="$(basename "${tpl}")" + out_name="${base_name/__gpt-5__o3__/__${SAFE_ALIAS}__${SAFE_OPPONENT_ALIAS}__}" + out_path="${RUN_CONFIG_DIR}/${out_name}" + + uv run python - "${tpl}" "${out_path}" "${MODEL}" "${ALIAS}" "${OPPONENT}" "${OPPONENT_ALIAS}" "${PLAYER_MODEL_CLASS}" "${OPPONENT_MODEL_CLASS}" "${PLAYER_REASONING_EFFORT}" "${OPPONENT_REASONING_EFFORT}" <<'PY' +from pathlib import Path +import re +import sys + +src = Path(sys.argv[1]) +dst = Path(sys.argv[2]) +model = sys.argv[3] +alias = sys.argv[4] +opponent = sys.argv[5] +opponent_alias = sys.argv[6] +player_model_class = sys.argv[7] +opponent_model_class = sys.argv[8] +player_reasoning_effort = sys.argv[9] +opponent_reasoning_effort = sys.argv[10] + +text = src.read_text() + +# First player name in template is gpt-5. +text = re.sub(r"(?m)^ name: gpt-5$", f" name: {alias}", text, count=1) +# Second player name in template is o3. +text = re.sub(r"(?m)^ name: o3$", f" name: {opponent_alias}", text, count=1) + +# Convert model IDs from Portkey-style "@openai/*" to LiteLLM "openai/*". +text = text.replace("model_name: '@openai/gpt-5'", f"model_name: '{model}'") +text = text.replace("model_name: '@openai/o3'", f"model_name: '{opponent}'") + +# Base class for generated configs. +text = text.replace("model_class: portkey", "model_class: litellm") + +# The template has exactly two player blocks; map class by player order. +class_lines = [i for i, line in enumerate(text.splitlines()) if line.strip() == "model_class: litellm"] +lines = text.splitlines() +if len(class_lines) >= 1 and player_model_class != "litellm": + lines[class_lines[0]] = re.sub(r"litellm$", player_model_class, lines[class_lines[0]]) +if len(class_lines) >= 2 and opponent_model_class != "litellm": + lines[class_lines[1]] = re.sub(r"litellm$", opponent_model_class, lines[class_lines[1]]) + +# Inject per-player reasoning effort as model_kwargs when requested. +offset = 0 +targets = [ + (class_lines[0] if len(class_lines) >= 1 else None, player_reasoning_effort), + (class_lines[1] if len(class_lines) >= 2 else None, opponent_reasoning_effort), +] +for class_idx, effort in targets: + if class_idx is None or not effort: + continue + idx = class_idx + offset + indent = re.match(r"^(\s*)", lines[idx]).group(1) + effort_escaped = effort.replace("'", "''") + lines[idx + 1 : idx + 1] = [ + f"{indent}model_kwargs:", + f"{indent} reasoning_effort: '{effort_escaped}'", + ] + offset += 2 + +text = "\n".join(lines) + ("\n" if text.endswith("\n") else "") + +dst.write_text(text) +PY + + GENERATED_CONFIGS+=("${out_path}") +done + +echo "==> Model: ${MODEL}" +echo "==> Alias: ${ALIAS}" +echo "==> Opponent: ${OPPONENT}" +echo "==> Opponent alias: ${OPPONENT_ALIAS}" +echo "==> Player model class: ${PLAYER_MODEL_CLASS}" +echo "==> Opponent model class: ${OPPONENT_MODEL_CLASS}" +echo "==> Player reasoning_effort: ${PLAYER_REASONING_EFFORT:-}" +echo "==> Opponent reasoning_effort: ${OPPONENT_REASONING_EFFORT:-}" +echo "==> Generated configs dir: ${RUN_CONFIG_DIR}" +echo "==> Logs dir: ${LOG_DIR}" +echo "==> Resume: ${RESUME}" +echo "==> Continue on error: ${CONTINUE_ON_ERROR}" +echo "==> Max config retries: ${MAX_CONFIG_RETRIES}" +echo "==> Configs:" +printf ' - %s\n' "${GENERATED_CONFIGS[@]}" + +if [[ ${DRY_RUN} -eq 1 ]]; then + echo + echo "Dry-run only. 
Commands that would run:" + for cfg in "${GENERATED_CONFIGS[@]}"; do + echo "uv run python ${REPO_ROOT}/main.py ${cfg} -o ${LOG_DIR} -s ${RUN_SUFFIX}" + done + if [[ ${RUN_POST_EVAL} -eq 1 ]]; then + if [[ ${OPEN_VIEWER} -eq 1 ]]; then + echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${LOG_DIR} --viewer" + else + echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${LOG_DIR}" + fi + fi + exit 0 +fi + +cd "${REPO_ROOT}" + +is_config_completed() { + local cfg_path="$1" + local log_dir="$2" + uv run python - "$cfg_path" "$log_dir" <<'PY' +import json +import sys +from pathlib import Path + +import yaml + +cfg_path = Path(sys.argv[1]) +log_dir = Path(sys.argv[2]) + +cfg = yaml.safe_load(cfg_path.read_text()) +cfg_game = cfg["game"]["name"] +cfg_rounds = int(cfg["tournament"]["rounds"]) +cfg_players = sorted(p["name"] for p in cfg["players"]) +expected_round_keys = {str(i) for i in range(cfg_rounds + 1)} + +for meta_path in log_dir.rglob("metadata.json"): + try: + meta = json.loads(meta_path.read_text()) + except Exception: + continue + conf = meta.get("config", {}) + game = conf.get("game", {}).get("name") + players = sorted(p.get("name") for p in conf.get("players", [])) + if game != cfg_game or players != cfg_players: + continue + round_stats = meta.get("round_stats", {}) + if expected_round_keys.issubset(set(round_stats.keys())): + print(meta_path) + sys.exit(0) + +sys.exit(1) +PY +} + +declare -a COMPLETED_CONFIGS=() +declare -a SKIPPED_CONFIGS=() +declare -a FAILED_CONFIGS=() + +for cfg in "${GENERATED_CONFIGS[@]}"; do + echo + echo "==> Running benchmark: ${cfg}" + + if [[ ${RESUME} -eq 1 ]]; then + if completed_path="$(is_config_completed "${cfg}" "${LOG_DIR}" 2>/dev/null)"; then + echo " skipping (resume): found completed run at ${completed_path}" + SKIPPED_CONFIGS+=("${cfg}") + continue + fi + fi + + max_attempts=$((MAX_CONFIG_RETRIES + 1)) + attempt=1 + ran_ok=0 + while [[ ${attempt} -le ${max_attempts} ]]; do + echo " attempt ${attempt}/${max_attempts}" + if uv run python "${REPO_ROOT}/main.py" "${cfg}" -o "${LOG_DIR}" -s "${RUN_SUFFIX}"; then + ran_ok=1 + break + fi + if [[ ${attempt} -lt ${max_attempts} ]]; then + sleep_s=$((15 * (2 ** (attempt - 1)))) + echo " failed attempt ${attempt}; retrying in ${sleep_s}s..." + sleep "${sleep_s}" + fi + attempt=$((attempt + 1)) + done + + if [[ ${ran_ok} -eq 1 ]]; then + COMPLETED_CONFIGS+=("${cfg}") + continue + fi + + FAILED_CONFIGS+=("${cfg}") + if [[ ${CONTINUE_ON_ERROR} -eq 0 ]]; then + echo "Error: benchmark failed and --continue-on-error is not set." >&2 + exit 1 + fi +done + +echo +echo "==> Run summary" +echo "Completed: ${#COMPLETED_CONFIGS[@]}" +echo "Skipped (resume): ${#SKIPPED_CONFIGS[@]}" +echo "Failed: ${#FAILED_CONFIGS[@]}" +if [[ ${#FAILED_CONFIGS[@]} -gt 0 ]]; then + printf ' - %s\n' "${FAILED_CONFIGS[@]}" +fi + +if [[ ${RUN_POST_EVAL} -eq 1 ]]; then + echo + echo "==> Running post-eval pipeline..." + if [[ ${OPEN_VIEWER} -eq 1 ]]; then + "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${LOG_DIR}" --viewer + else + "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${LOG_DIR}" + fi +fi + +echo +echo "Done." 
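+# If --post-eval was not requested, the same analysis can be run later with:
+#   scripts/run_eval_pipeline.sh --log-dir <the logs dir printed below>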
+echo "Logs: ${LOG_DIR}" +echo "Generated configs: ${RUN_CONFIG_DIR}" diff --git a/scripts/run_openai_sweep.sh b/scripts/run_openai_sweep.sh new file mode 100755 index 00000000..16cb59cd --- /dev/null +++ b/scripts/run_openai_sweep.sh @@ -0,0 +1,332 @@ +#!/usr/bin/env bash + +set -euo pipefail + +REPO="/Users/muhtasham/Documents/CodeClash" +MODELS=( + "openai/gpt-5.4" + "openai/gpt-5.3-codex" +) +OPPONENT="openai/gpt-5" + +RUN_ROOT="$REPO/logs/new_openai_sweep_$(date +%Y%m%d_%H%M%S)" +CHECK_ONLY=0 +PUSH_DIFFS=0 +OPEN_VIEWER=0 +RESUME=0 +CONTINUE_ON_ERROR=0 +MAX_CONFIG_RETRIES=2 +REASONING_EFFORT="" +PLAYER_REASONING_EFFORT="" +OPPONENT_REASONING_EFFORT="" + +usage() { + cat <<'EOF' +Usage: + scripts/run_openai_sweep.sh [options] + +Options: + --run-root Set custom logs root for this sweep. + --opponent Opponent baseline model (default: openai/gpt-5). + --reasoning-effort + Set reasoning_effort for both players (e.g. low|medium|high). + --player-reasoning-effort + Set reasoning_effort for evaluated models only. + --opponent-reasoning-effort + Set reasoning_effort for opponent only. + --check-only Run preflight checks + dry runs only, then exit. + --resume Skip already-completed per-arena configs in an existing --run-root. + --max-config-retries + Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue with other configs/models when a config fails. + --push-diffs After eval, push per-tournament code diffs to arena repos. + --viewer Launch local viewer at end of pipeline. + -h, --help Show help. + +What this script does: + 1) Preflight checks: + - local scripts/dependencies + - LiteLLM support for: + openai/gpt-5.4 + openai/gpt-5.3-codex + and opponent baseline model + - dry-run config generation for each model + 2) Full benchmark runs for all listed models (all standard arenas vs chosen opponent) + 3) Combined post-eval pipeline over one shared run root + 4) Optional diff-branch push to CodeClash arena repos +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --opponent) + OPPONENT="${2:-}" + shift 2 + ;; + --reasoning-effort) + REASONING_EFFORT="${2:-}" + shift 2 + ;; + --player-reasoning-effort) + PLAYER_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --opponent-reasoning-effort) + OPPONENT_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --check-only) + CHECK_ONLY=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --push-diffs) + PUSH_DIFFS=1 + shift + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! 
[[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +normalize_model_id() { + local v="${1#@}" + if [[ "${v}" == */* ]]; then + echo "${v}" + else + echo "openai/${v}" + fi +} + +OPPONENT="$(normalize_model_id "${OPPONENT}")" + +if [[ -n "${REASONING_EFFORT}" ]]; then + if [[ -z "${PLAYER_REASONING_EFFORT}" ]]; then + PLAYER_REASONING_EFFORT="${REASONING_EFFORT}" + fi + if [[ -z "${OPPONENT_REASONING_EFFORT}" ]]; then + OPPONENT_REASONING_EFFORT="${REASONING_EFFORT}" + fi +fi + +COMMON_BENCH_ARGS=(--opponent "$OPPONENT" --max-config-retries "$MAX_CONFIG_RETRIES") +if [[ "$RESUME" -eq 1 ]]; then + COMMON_BENCH_ARGS+=(--resume) +fi +if [[ "$CONTINUE_ON_ERROR" -eq 1 ]]; then + COMMON_BENCH_ARGS+=(--continue-on-error) +fi +if [[ -n "${PLAYER_REASONING_EFFORT}" ]]; then + COMMON_BENCH_ARGS+=(--player-reasoning-effort "$PLAYER_REASONING_EFFORT") +fi +if [[ -n "${OPPONENT_REASONING_EFFORT}" ]]; then + COMMON_BENCH_ARGS+=(--opponent-reasoning-effort "$OPPONENT_REASONING_EFFORT") +fi + +if [[ ! -d "$REPO" ]]; then + echo "Repo not found: $REPO" >&2 + exit 1 +fi + +if [[ ! -x "$REPO/scripts/run_openai_model_benchmarks.sh" ]]; then + echo "Missing or non-executable: $REPO/scripts/run_openai_model_benchmarks.sh" >&2 + exit 1 +fi + +if [[ ! -x "$REPO/scripts/run_eval_pipeline.sh" ]]; then + echo "Missing or non-executable: $REPO/scripts/run_eval_pipeline.sh" >&2 + exit 1 +fi + +if [[ ! -f "$REPO/scripts/push_log_to_gh.py" ]]; then + echo "Missing: $REPO/scripts/push_log_to_gh.py" >&2 + exit 1 +fi + +mkdir -p "$RUN_ROOT" + +cd "$REPO" + +echo "==> Auth preflight: forcing OPENAI key from repo .env..." +OPENAI_KEY_FROM_REPO="$( + uv run python - "$REPO/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "$OPENAI_KEY_FROM_REPO" ]]; then + echo "Error: OPENAI_API_KEY missing in $REPO/.env" >&2 + exit 1 +fi +export OPENAI_API_KEY="$OPENAI_KEY_FROM_REPO" +unset OPENAI_KEY_FROM_REPO + +uv run python - "$REPO/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f" source: {repo_env}") +print(f" active OPENAI_API_KEY: {fp(env_key)}") +print(f" repo OPENAI_API_KEY: {fp(repo_key)}") +if mini_key: + print(f" mini OPENAI_API_KEY: {fp(mini_key)}") + if mini_key != repo_key: + print(" note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +echo "==> Sweep run root: $RUN_ROOT" +echo "==> Models:" +printf ' - %s\n' "${MODELS[@]}" +echo "==> Opponent baseline: $OPPONENT" +echo "==> Player reasoning_effort: ${PLAYER_REASONING_EFFORT:-}" +echo "==> Opponent reasoning_effort: ${OPPONENT_REASONING_EFFORT:-}" + +echo +echo "==> Preflight 1/3: LiteLLM model support checks..." 
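+# Fails fast, before any tournament spend, if the installed litellm cannot
+# route the swept models or the opponent (provider lookup + model_info).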
+uv run python - "$OPPONENT" <<'PY' +import sys +from importlib.metadata import version +import litellm + +opponent = sys.argv[1] +models = [ + "openai/gpt-5.4", + "openai/gpt-5.3-codex", + opponent, +] + +print(f"litellm_version={version('litellm')}") +ok = True +for m in models: + print(f"\nMODEL {m}") + try: + print(" provider:", litellm.get_llm_provider(model=m)) + except Exception as e: + ok = False + print(f" provider_error: {type(e).__name__}: {e}") + try: + info = litellm.get_model_info(model=m) + print( + " model_info:", + { + "max_input_tokens": info.get("max_input_tokens"), + "max_output_tokens": info.get("max_output_tokens"), + "supports_function_calling": info.get("supports_function_calling"), + }, + ) + except Exception as e: + ok = False + print(f" model_info_error: {type(e).__name__}: {e}") + +if not ok: + sys.exit(1) +PY + +echo +echo "==> Preflight 2/3: Dry-run config generation checks..." +for MODEL in "${MODELS[@]}"; do + echo " -> $MODEL" + "$REPO/scripts/run_openai_model_benchmarks.sh" \ + --model "$MODEL" \ + --log-dir "$RUN_ROOT" \ + "${COMMON_BENCH_ARGS[@]}" \ + --dry-run >/dev/null +done + +echo +echo "==> Preflight 3/3: GitHub CLI auth check (needed for optional upload/push flows)..." +gh auth status >/dev/null +echo " gh auth: OK" + +if [[ "$CHECK_ONLY" -eq 1 ]]; then + echo + echo "Preflight + dry-run checks passed. Exiting due to --check-only." + exit 0 +fi + +echo +echo "==> Running full sweeps..." +for MODEL in "${MODELS[@]}"; do + echo + echo "### Running model: $MODEL" + "$REPO/scripts/run_openai_model_benchmarks.sh" \ + --model "$MODEL" \ + --log-dir "$RUN_ROOT" \ + "${COMMON_BENCH_ARGS[@]}" +done + +echo +echo "==> Running combined post-eval pipeline..." +if [[ "$OPEN_VIEWER" -eq 1 ]]; then + "$REPO/scripts/run_eval_pipeline.sh" --log-dir "$RUN_ROOT" --viewer +else + "$REPO/scripts/run_eval_pipeline.sh" --log-dir "$RUN_ROOT" +fi + +if [[ "$PUSH_DIFFS" -eq 1 ]]; then + echo + echo "==> Pushing per-tournament diffs to arena repos..." + find "$RUN_ROOT" -type f -name metadata.json -print0 \ + | xargs -0 -I{} dirname "{}" \ + | sort -u \ + | while read -r folder; do + echo " -> $folder" + uv run python "$REPO/scripts/push_log_to_gh.py" "$folder" + done +fi + +echo +echo "Done." +echo "Run root: $RUN_ROOT" +echo "Combined leaderboard JSON: $RUN_ROOT/analysis/elo/leaderboards.json" diff --git a/scripts/scrape_viewer_leaderboard_runs.py b/scripts/scrape_viewer_leaderboard_runs.py new file mode 100644 index 00000000..7db572a7 --- /dev/null +++ b/scripts/scrape_viewer_leaderboard_runs.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +"""Scrape viewer.codeclash.ai completed runs and download metadata.json files. + +By default this targets the 8-model public leaderboard cohort and the 6 arenas: +BattleSnake, CoreWar, Halite, RobotRumble, RoboCode, HuskyBench. 
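+
+Typical invocation (flags as defined in main() below; the output directory is
+an arbitrary local path):
+
+    uv run python scripts/scrape_viewer_leaderboard_runs.py \
+        --output-root viewer_runs --strategy latest-per-pair-game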
+""" + +from __future__ import annotations + +import argparse +import json +import re +import urllib.parse +import urllib.request +from dataclasses import dataclass +from pathlib import Path + + +DEFAULT_MODELS = [ + "claude-sonnet-4-5-20250929", + "gpt-5", + "o3", + "claude-sonnet-4-20250514", + "gpt-5-mini", + "gemini-2.5-pro", + "grok-code-fast-1", + "qwen3-coder-plus-2025-09-23", +] + +DEFAULT_GAMES = ["BattleSnake", "CoreWar", "Halite", "RobotRumble", "RoboCode", "HuskyBench"] + +VIEWER_BASE = "https://viewer.codeclash.ai" + + +@dataclass(frozen=True) +class RunRef: + rel_path: str + game: str + rounds: int + sims: int + players: int + p1: str + p2: str + ts: str + + +def fetch_index_html() -> str: + with urllib.request.urlopen(VIEWER_BASE + "/", timeout=60) as r: + return r.read().decode("utf-8", errors="replace") + + +def extract_paths(html: str) -> list[str]: + # Example: data-path="completed/PvpTournament.BattleSnake.r15.s1000.p2.a.b.251002061714" + return sorted(set(re.findall(r'data-path="(completed/PvpTournament\.[^"]+)"', html))) + + +def _alias_variants(alias: str) -> set[str]: + norm = re.sub(r"[^a-zA-Z0-9]", "", alias).lower() + return {alias, norm} + + +def _build_variant_lookup(models: set[str]) -> dict[str, str]: + out: dict[str, str] = {} + for m in models: + for v in _alias_variants(m): + out[v] = m + return out + + +def parse_run(path: str, models: set[str], variant_to_alias: dict[str, str]) -> RunRef | None: + # Parse fixed front/back first, then decode the middle p1.p2 region safely. + if not path.startswith("completed/PvpTournament."): + return None + # Strip prefix "completed/PvpTournament." + body = path[len("completed/PvpTournament.") :] + # Find timestamp token (12 digits), allowing optional suffix after it + # (e.g. ".-uuid" in some games). + try: + pre = body + parts = pre.split(".") + ts_idx = None + for i in range(len(parts) - 1, -1, -1): + if re.fullmatch(r"\d{12}", parts[i]): + ts_idx = i + break + if ts_idx is None: + return None + ts = parts[ts_idx] + pre = ".".join(parts[:ts_idx]) + except ValueError: + return None + parts = pre.split(".") + if len(parts) < 5: + return None + game = parts[0] + rounds_s = parts[1] + sims_s = parts[2] + players_s = parts[3] + model_region = ".".join(parts[4:]) + + if not rounds_s.startswith("r") or not sims_s.startswith("s") or not players_s.startswith("p"): + return None + + rounds = int(rounds_s[1:]) + sims = int(sims_s[1:]) + players = int(players_s[1:]) + + # Identify p1/p2 using known model aliases (including normalized + # hyphenless variants used in some logs). + p1 = p2 = None + variants = sorted(variant_to_alias.keys(), key=len, reverse=True) + for alias_variant in variants: + pref = alias_variant + "." + if model_region.startswith(pref): + tail = model_region[len(pref) :] + if tail in variant_to_alias: + p1 = variant_to_alias[alias_variant] + p2 = variant_to_alias[tail] + break + if p1 is None or p2 is None: + return None + + return RunRef( + rel_path=path, + game=game, + rounds=rounds, + sims=sims, + players=players, + p1=p1, + p2=p2, + ts=ts, + ) + + +def build_download_url(rel_path: str) -> str: + # Endpoint expects absolute path on the viewer host. 
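+    # The /home/klieret/CodeClash/logs prefix below is where the viewer host
+    # keeps its runs; it is hardcoded here and must track server-side moves.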
+ abs_path = f"/home/klieret/CodeClash/logs/{rel_path}/metadata.json" + q = urllib.parse.urlencode({"path": abs_path}) + return f"{VIEWER_BASE}/download-file/?{q}" + + +def build_game_page_url(rel_path: str) -> str: + return f"{VIEWER_BASE}/game/{rel_path}.html" + + +def _extract_json_object_after_marker(text: str, marker: str) -> str | None: + idx = text.find(marker) + if idx < 0: + return None + i = idx + len(marker) + while i < len(text) and text[i].isspace(): + i += 1 + if i >= len(text) or text[i] != "{": + return None + + # Brace-match while respecting quoted strings. + depth = 0 + in_str = False + escaped = False + start = i + for j in range(i, len(text)): + ch = text[j] + if in_str: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_str = False + continue + if ch == '"': + in_str = True + continue + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return text[start : j + 1] + return None + + +def _extract_embedded_metadata_from_game_html(html: str) -> dict | None: + blob = _extract_json_object_after_marker(html, "initializeJSONEditors(") + if not blob: + return None + return json.loads(blob) + + +def download(url: str, out_file: Path) -> bool: + out_file.parent.mkdir(parents=True, exist_ok=True) + try: + with urllib.request.urlopen(url, timeout=60) as r: + data = r.read() + out_file.write_bytes(data) + # basic sanity + json.loads(out_file.read_text()) + return True + except Exception: + return False + + +def download_metadata_via_game_page(rel_path: str, out_file: Path) -> bool: + out_file.parent.mkdir(parents=True, exist_ok=True) + page_url = build_game_page_url(rel_path) + try: + with urllib.request.urlopen(page_url, timeout=60) as r: + html = r.read().decode("utf-8", errors="replace") + payload = _extract_embedded_metadata_from_game_html(html) + if payload is None: + return False + # Viewer pages embed a wrapper object used by front-end widgets. The + # actual tournament metadata is under "results". 
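+        # Fall back to the whole payload when "results" is absent so older
+        # page formats still yield a usable metadata.json.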
+ metadata = payload.get("results") if isinstance(payload, dict) else None + if not isinstance(metadata, dict): + metadata = payload + out_file.write_text(json.dumps(metadata, indent=2)) + # basic sanity + json.loads(out_file.read_text()) + return True + except Exception: + return False + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument( + "--output-root", + type=Path, + required=True, + help="Local root to save downloaded runs (folders with metadata.json).", + ) + ap.add_argument("--models", nargs="*", default=DEFAULT_MODELS, help="Model aliases to include.") + ap.add_argument("--games", nargs="*", default=DEFAULT_GAMES, help="Game names to include.") + ap.add_argument("--rounds", type=int, default=15) + ap.add_argument("--players", type=int, default=2) + ap.add_argument( + "--strategy", + choices=["latest-per-pair-game", "all-matching"], + default="latest-per-pair-game", + help="Download only latest run per (game,unordered pair), or all matching runs.", + ) + args = ap.parse_args() + + models = set(args.models) + games = set(args.games) + variant_to_alias = _build_variant_lookup(models) + + html = fetch_index_html() + paths = extract_paths(html) + + runs: list[RunRef] = [] + for p in paths: + r = parse_run(p, models, variant_to_alias) + if r is None: + continue + if r.game not in games: + continue + if r.rounds != args.rounds or r.players != args.players: + continue + runs.append(r) + + if args.strategy == "latest-per-pair-game": + best: dict[tuple[str, tuple[str, str]], RunRef] = {} + for r in runs: + pair = tuple(sorted((r.p1, r.p2))) + k = (r.game, pair) + prev = best.get(k) + if prev is None or int(r.ts) > int(prev.ts): + best[k] = r + selected = sorted(best.values(), key=lambda x: (x.game, tuple(sorted((x.p1, x.p2))), x.ts)) + else: + selected = sorted(runs, key=lambda x: (x.game, x.p1, x.p2, x.ts)) + + ok = 0 + fail = 0 + manifest = [] + for r in selected: + url = build_download_url(r.rel_path) + page_url = build_game_page_url(r.rel_path) + out_file = args.output_root / r.rel_path / "metadata.json" + success = download_metadata_via_game_page(r.rel_path, out_file) + if not success: + success = download(url, out_file) + manifest.append( + { + "rel_path": r.rel_path, + "game": r.game, + "p1": r.p1, + "p2": r.p2, + "rounds": r.rounds, + "players": r.players, + "sims": r.sims, + "ts": r.ts, + "page_url": page_url, + "download_url": url, + "ok": success, + "local_metadata": str(out_file), + } + ) + if success: + ok += 1 + else: + fail += 1 + + args.output_root.mkdir(parents=True, exist_ok=True) + (args.output_root / "download_manifest.json").write_text(json.dumps(manifest, indent=2)) + + summary = { + "selected_runs": len(selected), + "download_ok": ok, + "download_failed": fail, + "output_root": str(args.output_root), + "strategy": args.strategy, + "models": sorted(models), + "games": sorted(games), + } + (args.output_root / "summary.json").write_text(json.dumps(summary, indent=2)) + print(json.dumps(summary, indent=2)) + return 0 if fail == 0 else 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/watch_sweep_progress.sh b/scripts/watch_sweep_progress.sh new file mode 100755 index 00000000..1729f382 --- /dev/null +++ b/scripts/watch_sweep_progress.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" + +RUN_ROOT="" +OPPONENT_ALIAS="gpt-5" +INTERVAL=15 +ONCE=0 +ALL_TIERS=0 + +usage() { + cat <<'EOF' +Usage: + scripts/watch_sweep_progress.sh [options] + +Options: + --run-root Explicit sweep log root. + --opponent-alias Match generated config dirs: *_vs_ (default: gpt-5). + --all-tiers Show default, low, medium, and high tiers together. + --interval Refresh interval (default: 15). + --once Print one snapshot and exit. + -h, --help Show help. + +Default run-root auto-detection order: + 1) latest logs/new_openai_sweep_* + 2) latest logs/gpt54_vs_gpt53codex_reasoning_* +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --opponent-alias) + OPPONENT_ALIAS="${2:-}" + shift 2 + ;; + --all-tiers) + ALL_TIERS=1 + shift + ;; + --interval) + INTERVAL="${2:-}" + shift 2 + ;; + --once) + ONCE=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${RUN_ROOT}" ]]; then + RUN_ROOT="$( + ls -td \ + "${REPO_ROOT}"/logs/new_openai_sweep_* \ + "${REPO_ROOT}"/logs/gpt54_vs_gpt53codex_reasoning_* \ + 2>/dev/null | head -n 1 || true + )" +fi + +if [[ -z "${RUN_ROOT}" || ! -d "${RUN_ROOT}" ]]; then + echo "No valid run root found. Pass --run-root explicitly." >&2 + exit 1 +fi + +if ! [[ "${INTERVAL}" =~ ^[0-9]+$ ]]; then + echo "--interval must be an integer number of seconds." >&2 + exit 1 +fi + +print_snapshot() { + local opponent_alias="$1" + cd "${REPO_ROOT}" + uv run python - "${REPO_ROOT}" "${RUN_ROOT}" "${opponent_alias}" <<'PY' +from pathlib import Path +import json +import yaml + +from codeclash import CONFIG_DIR +from codeclash.utils.yaml_utils import resolve_includes + +repo = Path(__import__("sys").argv[1]) +run_root = Path(__import__("sys").argv[2]) +opponent_alias = __import__("sys").argv[3] + +def normalize_players(players: list[str], opponent_alias: str) -> list[str]: + """Normalize known stale aliases in generated configs/metadata for matching.""" + suffix = None + prefix = "gpt-5.3-codex-" + if opponent_alias.startswith(prefix): + suffix = opponent_alias[len(prefix):] + + normalized = [] + for player in players: + if player == "gpt5" and suffix: + normalized.append(f"gpt-5.4-{suffix}") + else: + normalized.append(player) + return sorted(normalized) + +cfgs = sorted((repo / "configs" / "generated").glob(f"*_vs_{opponent_alias}/*.yaml")) +print(f"RUN_ROOT: {run_root}") +print(f"TOTAL CONFIGS: {len(cfgs)}") + +metas = [] +for m in run_root.rglob("metadata.json"): + try: + md = json.loads(m.read_text()) + cc = md.get("config", {}) + metas.append( + ( + cc.get("game", {}).get("name"), + normalize_players([p.get("name") for p in cc.get("players", [])], opponent_alias), + md, + m, + ) + ) + except Exception: + pass + +done = partial = pending = 0 +for c in cfgs: + cfg = yaml.safe_load(resolve_includes(c.read_text(), base_dir=CONFIG_DIR)) + game = cfg["game"]["name"] + rounds = int(cfg["tournament"]["rounds"]) + players = normalize_players([p["name"] for p in cfg["players"]], opponent_alias) + + # Pick newest metadata for this game+player pair (important when retries create multiple folders). 
+ hit = None + newest_mtime = -1.0 + for g, p, md, meta_path in metas: + if g != game or p != players: + continue + ts = float((md.get("timing") or {}).get("start_time", 0.0)) + if ts < 1.0: + ts = meta_path.stat().st_mtime + if ts >= newest_mtime: + newest_mtime = ts + hit = md + + if not hit: + st = "PENDING" + pending += 1 + else: + rs = hit.get("round_stats", {}) + st = "DONE" if len(rs) >= rounds + 1 else "PARTIAL" + done += st == "DONE" + partial += st == "PARTIAL" + + print(f"{st:7} {c.name}") + +print(f"\nSUMMARY done={done} partial={partial} pending={pending}") +PY +} + +print_all_tiers() { + local tier + for tier in default low medium high; do + echo "===== ${tier} =====" + print_snapshot "gpt-5.3-codex-${tier}" + echo + done +} + +if [[ "${ONCE}" -eq 1 ]]; then + if [[ "${ALL_TIERS}" -eq 1 ]]; then + print_all_tiers + else + print_snapshot "${OPPONENT_ALIAS}" + fi + exit 0 +fi + +while true; do + clear + echo "Sweep Progress Monitor ($(date))" + echo + if [[ "${ALL_TIERS}" -eq 1 ]]; then + print_all_tiers + else + print_snapshot "${OPPONENT_ALIAS}" + fi + sleep "${INTERVAL}" +done diff --git a/tests/arenas/test_robocode.py b/tests/arenas/test_robocode.py index 31e232d9..0fafeb27 100644 --- a/tests/arenas/test_robocode.py +++ b/tests/arenas/test_robocode.py @@ -8,6 +8,7 @@ from codeclash.arenas.arena import RoundStats from codeclash.arenas.robocode.robocode import RC_FILE, SIMS_PER_RUN, RoboCodeArena +from codeclash.constants import RESULT_TIE from .conftest import MockPlayer @@ -225,6 +226,37 @@ def test_parse_results_player2_wins(self, arena, tmp_log_dir): assert stats.scores["Alice"] == 4500 assert stats.scores["Bob"] == 9500 + def test_parse_results_tie(self, arena, tmp_log_dir): + """Equal total scores should be recorded as a tie, not assigned to the first player.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) + + self._create_results_file( + round_dir, + 0, + [ + (1, "Alice.MyTank", 0), + (2, "Bob.MyTank", 0), + ], + ) + self._create_results_file( + round_dir, + 1, + [ + (1, "Alice.MyTank", 0), + (2, "Bob.MyTank", 0), + ], + ) + + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) + + arena.get_results(agents, round_num=1, stats=stats) + + assert stats.winner == RESULT_TIE + assert stats.scores["Alice"] == 0 + assert stats.scores["Bob"] == 0 + class TestRoboCodeConfig: """Tests for RoboCodeArena configuration and properties."""