From 41e80a510a190a02bef470783638d9242dc99e1a Mon Sep 17 00:00:00 2001 From: muhtasham Date: Sun, 26 Apr 2026 02:54:44 +0200 Subject: [PATCH] Add OpenAI sweep tooling and analysis helpers - add benchmark runners, watcher, and eval pipeline scripts for OpenAI sweeps - add generated configs, reporting utilities, and OpenAI feedback notes - preserve model aliases in analysis and fix RoboCode tie handling --- codeclash/analysis/metrics/elo.py | 27 +- codeclash/analysis/metrics/win_rate.py | 14 +- codeclash/analysis/viz/heatmap_win_rates.py | 5 +- codeclash/analysis/viz/utils.py | 16 + .../arenas/huskybench/HuskyBench.Dockerfile | 3 +- codeclash/arenas/robocode/robocode.py | 9 +- configs/ablations/scaffold/README.md | 184 +++++++ configs/ablations/scaffold/manifest.yaml | 142 +++++ ...ake__gpt-5.3-codex__gpt-5__r15__s1000.yaml | 40 ++ ...War__gpt-5.3-codex__gpt-5__r15__s1000.yaml | 37 ++ ...lite__gpt-5.3-codex__gpt-5__r15__s250.yaml | 48 ++ ...ench__gpt-5.3-codex__gpt-5__r15__s100.yaml | 39 ++ ...Code__gpt-5.3-codex__gpt-5__r15__s250.yaml | 42 ++ ...mble__gpt-5.3-codex__gpt-5__r15__s250.yaml | 39 ++ ...lt__gpt-5.3-codex-default__r15__s1000.yaml | 36 ++ ...lt__gpt-5.3-codex-default__r15__s1000.yaml | 33 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 44 ++ ...ult__gpt-5.3-codex-default__r15__s100.yaml | 35 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 38 ++ ...ult__gpt-5.3-codex-default__r15__s250.yaml | 35 ++ ...-high__gpt-5.3-codex-high__r15__s1000.yaml | 40 ++ ...-high__gpt-5.3-codex-high__r15__s1000.yaml | 37 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 48 ++ ...4-high__gpt-5.3-codex-high__r15__s100.yaml | 39 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 42 ++ ...4-high__gpt-5.3-codex-high__r15__s250.yaml | 39 ++ ....4-low__gpt-5.3-codex-low__r15__s1000.yaml | 40 ++ ....4-low__gpt-5.3-codex-low__r15__s1000.yaml | 37 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 48 ++ ...5.4-low__gpt-5.3-codex-low__r15__s100.yaml | 39 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 42 ++ ...5.4-low__gpt-5.3-codex-low__r15__s250.yaml | 39 ++ ...ium__gpt-5.3-codex-medium__r15__s1000.yaml | 40 ++ ...ium__gpt-5.3-codex-medium__r15__s1000.yaml | 37 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 48 ++ ...dium__gpt-5.3-codex-medium__r15__s100.yaml | 39 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 42 ++ ...dium__gpt-5.3-codex-medium__r15__s250.yaml | 39 ++ ...ttleSnake__gpt-5.4__gpt-5__r15__s1000.yaml | 40 ++ .../CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml | 37 ++ .../Halite__gpt-5.4__gpt-5__r15__s250.yaml | 48 ++ ...HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml | 39 ++ .../RoboCode__gpt-5.4__gpt-5__r15__s250.yaml | 42 ++ ...obotRumble__gpt-5.4__gpt-5__r15__s250.yaml | 39 ++ docs/openai_feedback_20260310.md | 94 ++++ scripts/finalize_openai_sweep_report.sh | 82 +++ scripts/plot_leaderboard_comparison.py | 117 ++++ scripts/plot_reasoning_head_to_head.py | 226 ++++++++ scripts/print_leaderboard_table.py | 55 ++ scripts/run_eval_pipeline.sh | 119 +++++ scripts/run_gpt54_gpt53codex_round_robin.sh | 312 +++++++++++ .../run_gpt54_vs_gpt53codex_high_remaining.sh | 81 +++ scripts/run_gpt54_vs_gpt53codex_reasoning.sh | 222 ++++++++ scripts/run_openai_model_benchmarks.sh | 498 ++++++++++++++++++ scripts/run_openai_sweep.sh | 332 ++++++++++++ scripts/scrape_viewer_leaderboard_runs.py | 324 ++++++++++++ scripts/watch_sweep_progress.sh | 199 +++++++ tests/arenas/test_robocode.py | 32 ++ 58 files changed, 4512 insertions(+), 27 deletions(-) create mode 100644 configs/ablations/scaffold/README.md 
create mode 100644 configs/ablations/scaffold/manifest.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml 
create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml create mode 100644 docs/openai_feedback_20260310.md create mode 100755 scripts/finalize_openai_sweep_report.sh create mode 100644 scripts/plot_leaderboard_comparison.py create mode 100644 scripts/plot_reasoning_head_to_head.py create mode 100644 scripts/print_leaderboard_table.py create mode 100755 scripts/run_eval_pipeline.sh create mode 100755 scripts/run_gpt54_gpt53codex_round_robin.sh create mode 100755 scripts/run_gpt54_vs_gpt53codex_high_remaining.sh create mode 100755 scripts/run_gpt54_vs_gpt53codex_reasoning.sh create mode 100755 scripts/run_openai_model_benchmarks.sh create mode 100755 scripts/run_openai_sweep.sh create mode 100644 scripts/scrape_viewer_leaderboard_runs.py create mode 100755 scripts/watch_sweep_progress.sh diff --git a/codeclash/analysis/metrics/elo.py b/codeclash/analysis/metrics/elo.py index 929d3154..c9207234 100644 --- a/codeclash/analysis/metrics/elo.py +++ b/codeclash/analysis/metrics/elo.py @@ -14,7 +14,7 @@ from tqdm import tqdm from codeclash.analysis.significance import calculate_p_value -from codeclash.analysis.viz.utils import ASSETS_DIR, FONT_BOLD, MODEL_TO_DISPLAY_NAME +from codeclash.analysis.viz.utils import ASSETS_DIR, FONT_BOLD, MODEL_TO_DISPLAY_NAME, model_display_name from codeclash.constants import LOCAL_LOG_DIR, RESULT_TIE from codeclash.utils.log import add_file_handler, get_logger @@ -75,9 +75,6 @@ def __init__( lambda: defaultdict(list) ) - def _get_unique_model_name(self, model: str) -> str: - return model.rpartition("/")[2] - def _get_sorted_pair(self, p1: str, p2: str) -> tuple[str, str]: return tuple(sorted([p1, p2])) @@ -154,8 +151,6 @@ def _process_tournament(self, metadata_path: Path) -> None: return player_names = [p["name"] for p in players] - models = [p["config"]["model"]["model_name"].strip("@") for p in players] - # Aggregate scores for each round p1_round_scores = [] p2_round_scores = [] @@ -199,7 +194,7 @@ def _process_tournament(self, metadata_path: Path) -> None: p2_score = sum(p2_round_scores) # Convert to unique names and sorted pair when updating matrix - unique_names = [self._get_unique_model_name(m) for m in models] + unique_names = player_names sorted_pair = self._get_sorted_pair(unique_names[0], unique_names[1]) if unique_names[0] == sorted_pair[0]: @@ -550,7 +545,7 @@ def create_elo_plots(self, output_dir: Path) -> None: player_order = [all_players[i] for i in all_indices] # Translate to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in player_order] + display_names = [model_display_name(p) for p in 
player_order] # Create mapping from player to y-position player_to_pos = {p: i for i, p in enumerate(player_order)} @@ -698,7 +693,7 @@ def create_validation_plots(self, output_dir: Path, regularization: float = 0.01 ax.set_xlabel("BT Strength", fontproperties=FONT_BOLD, fontsize=12) ax.set_ylabel("Negative Log-Likelihood", fontproperties=FONT_BOLD, fontsize=12) - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.set_title(display_name, fontproperties=FONT_BOLD, fontsize=14) legend = ax.legend(prop=FONT_BOLD, fontsize=10, loc="upper right") legend.set_frame_on(False) @@ -777,7 +772,7 @@ def _create_rank_matrix_plot( rank_matrix = (rank_matrix / self.n_bootstrap) * 100 # Translate player names to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in players] + display_names = [model_display_name(p) for p in players] fig, ax = plt.subplots(figsize=(6, 6)) im = ax.imshow(rank_matrix, cmap="YlOrRd", aspect="auto", vmin=0, vmax=100) @@ -826,7 +821,7 @@ def _create_elo_violin_plot( elo_data = [elo_samples[p] for p in players] # Translate player names to display names - display_names = [MODEL_TO_DISPLAY_NAME.get(p, p) for p in players] + display_names = [model_display_name(p) for p in players] fig, ax = plt.subplots(figsize=(6, 6)) @@ -1095,7 +1090,7 @@ def _plot_results(self, results_by_max_round: dict[int, dict[str, dict[str, floa elos_list.append(results_by_max_round[max_round][game_name][player]) if max_rounds_list: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.plot(max_rounds_list, elos_list, marker="o", label=display_name, linewidth=2, markersize=6) ax.set_xlabel("Max Round", fontproperties=FONT_BOLD, fontsize=14) @@ -1212,7 +1207,7 @@ def _plot_results(self, results_by_round: dict[int, dict[str, dict[str, float]]] elos_list.append(results_by_round[round_num][game_name][player]) if rounds_list: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) ax.plot(rounds_list, elos_list, marker="o", label=display_name, linewidth=2, markersize=6) ax.set_xlabel("Round", fontproperties=FONT_BOLD, fontsize=14) @@ -1348,7 +1343,7 @@ def write_latex_table(results: dict[str, dict], output_dir: Path) -> None: lines.append(r"\midrule") for player, all_elo in sorted_players: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) row_parts = [display_name.replace("_", r"\_")] for game_name in games_in_table: @@ -1407,7 +1402,7 @@ def write_website_results(results: dict[str, dict], output_dir: Path) -> None: # Create leaderboard entries board = [] for rank, (player, elo) in enumerate(sorted_players): - entry = {"rank": rank + 1, "model": MODEL_TO_DISPLAY_NAME.get(player, player), "elo": int(round(elo))} + entry = {"rank": rank + 1, "model": model_display_name(player), "elo": int(round(elo))} # Add confidence interval if available if elo_std is not None: player_idx = players.index(player) @@ -1506,7 +1501,7 @@ def write_latex_table_plain(results: dict[str, dict], output_dir: Path) -> None: lines.append(r"\midrule") for player, all_elo in sorted_players: - display_name = MODEL_TO_DISPLAY_NAME.get(player, player) + display_name = model_display_name(player) row_parts = [display_name.replace("_", r"\_")] for game_name in games_in_table: diff --git a/codeclash/analysis/metrics/win_rate.py b/codeclash/analysis/metrics/win_rate.py index aaa3df0c..58fdf664 100755 --- 
a/codeclash/analysis/metrics/win_rate.py +++ b/codeclash/analysis/metrics/win_rate.py @@ -31,16 +31,16 @@ def main(log_dir: Path): model_profiles = {} for game_log_folder in tqdm([x.parent for x in log_dir.rglob("metadata.json")]): game_id = game_log_folder.name.split(".")[1] - player_ids = [x.name for x in (game_log_folder / "players").iterdir() if x.is_dir()] metadata = json.load(open(game_log_folder / "metadata.json")) try: - player_to_model = { - x["name"]: x["config"]["model"]["model_name"].strip("@").split("/")[-1] - for x in metadata["config"]["players"] - } + player_ids = [x["name"] for x in metadata["config"]["players"]] + player_to_model = {x["name"]: x["name"] for x in metadata["config"]["players"]} except KeyError: continue - num_rounds = len(metadata["round_stats"]) + round_stats = metadata.get("round_stats") + if not isinstance(round_stats, dict) or not round_stats: + continue + num_rounds = len(round_stats) # Only count each unique model once per game unique_models = {player_to_model[player] for player in player_ids} @@ -55,7 +55,7 @@ def main(log_dir: Path): player_id=player_id, model_name=model_name, game_id=game_id, count=num_rounds ) - for round, details in metadata["round_stats"].items(): + for round, details in round_stats.items(): if round == "0": # Skip initial round continue diff --git a/codeclash/analysis/viz/heatmap_win_rates.py b/codeclash/analysis/viz/heatmap_win_rates.py index f817f503..b2139f1b 100755 --- a/codeclash/analysis/viz/heatmap_win_rates.py +++ b/codeclash/analysis/viz/heatmap_win_rates.py @@ -59,7 +59,7 @@ def main(log_dir: Path, unit: str = "rounds", output_file: Path = ASSETS_DIR / " # Build matrix models = sorted({m for pair in results.keys() for m in pair}) - clean_names = [MODEL_TO_DISPLAY_NAME[m.split("/")[-1]] for m in models] + clean_names = [MODEL_TO_DISPLAY_NAME.get(m.split("/")[-1], m.split("/")[-1]) for m in models] n = len(models) matrix = np.full((n, n), np.nan) @@ -73,7 +73,8 @@ def main(log_dir: Path, unit: str = "rounds", output_file: Path = ASSETS_DIR / " total_wins = sum(results[(m1, m2)][0] for m2 in models if m1 != m2) total_matches = sum(results[(m1, m2)][1] for m2 in models if m1 != m2) avg_win_rate = total_wins / total_matches if total_matches > 0 else 0 - print(f"{MODEL_TO_DISPLAY_NAME[m1.split('/')[-1]]}: {avg_win_rate:.2%} win rate over {total_matches} matches") + label = MODEL_TO_DISPLAY_NAME.get(m1.split("/")[-1], m1.split("/")[-1]) + print(f"{label}: {avg_win_rate:.2%} win rate over {total_matches} matches") # Plot FONT_BOLD.set_size(18) diff --git a/codeclash/analysis/viz/utils.py b/codeclash/analysis/viz/utils.py index 168e8c64..5e934cad 100644 --- a/codeclash/analysis/viz/utils.py +++ b/codeclash/analysis/viz/utils.py @@ -21,6 +21,22 @@ "o3": "o3", } + +def model_display_name(model: str) -> str: + label = MODEL_TO_DISPLAY_NAME.get(model, model) + tier_labels = { + "-default": " (Default)", + "-low": " (Low)", + "-medium": " (Medium)", + "-high": " (High)", + } + for suffix, pretty in tier_labels.items(): + if model.endswith(suffix): + base = model[: -len(suffix)] + base_label = MODEL_TO_DISPLAY_NAME.get(base, base) + return f"{base_label}{pretty}" + return label + MODEL_TO_COLOR = { "anthropic/claude-sonnet-4-20250514": "#FFD449", "anthropic/claude-sonnet-4-5-20250929": "#F75C03", diff --git a/codeclash/arenas/huskybench/HuskyBench.Dockerfile b/codeclash/arenas/huskybench/HuskyBench.Dockerfile index 0b1d4d52..9e1070f2 100644 --- a/codeclash/arenas/huskybench/HuskyBench.Dockerfile +++ 
b/codeclash/arenas/huskybench/HuskyBench.Dockerfile @@ -16,5 +16,6 @@ RUN git clone https://github.com/CodeClash-ai/HuskyBench.git /workspace \ && git remote set-url origin https://github.com/CodeClash-ai/HuskyBench.git WORKDIR /workspace -RUN pip install -r engine/requirements.txt +RUN pip install --no-cache-dir Cython setuptools wheel \ + && pip install --no-cache-dir -r engine/requirements.txt RUN mkdir -p /workspace/engine/output diff --git a/codeclash/arenas/robocode/robocode.py b/codeclash/arenas/robocode/robocode.py index a7dc16fb..da473873 100644 --- a/codeclash/arenas/robocode/robocode.py +++ b/codeclash/arenas/robocode/robocode.py @@ -10,6 +10,7 @@ from codeclash.agents.player import Player from codeclash.arenas.arena import CodeArena, RoundStats +from codeclash.constants import RESULT_TIE from codeclash.utils.environment import create_file_in_container RC_FILE = Path("MyTank.java") @@ -140,7 +141,13 @@ def get_results(self, agents: list[Player], round_num: int, stats: RoundStats): player = match.group(2).rsplit(".", 1)[0] scores[player] += int(match.group(3)) - stats.winner = max(scores, key=scores.get) + if not scores: + stats.winner = RESULT_TIE + return + + max_score = max(scores.values()) + leaders = [player for player, score in scores.items() if score == max_score] + stats.winner = RESULT_TIE if len(leaders) > 1 else leaders[0] stats.scores = scores for player, score in scores.items(): stats.player_stats[player].score = score diff --git a/configs/ablations/scaffold/README.md b/configs/ablations/scaffold/README.md new file mode 100644 index 00000000..e2b9b6c5 --- /dev/null +++ b/configs/ablations/scaffold/README.md @@ -0,0 +1,184 @@ +# Scaffold / Harness Ablation + +This folder defines a concrete experiment matrix for answering: + +1. How much of CodeClash performance is due to the model versus the agent harness? +2. Do Codex-style agent stacks help because of the scaffold alone, or because the model and scaffold are co-designed? + +## Current repository limitation + +Today this repository only exposes two agent types: + +- `mini` +- `dummy` + +See `codeclash/agents/__init__.py`. + +That means the experiments below are split into: + +- `ready_now`: Can be run once the referenced model exists in `configs/models.yaml` +- `blocked_on_adapter`: Requires adding a new agent adapter for `swe-agent`, `openhands`, or `codex-sdk` + +## Design rules + +All harness comparisons should keep the following fixed unless the experiment explicitly says otherwise: + +- same arena +- same opponent panel +- same model +- same number of rounds +- same per-round step limit +- same per-round dollar limit +- same tool surface +- same repository snapshot +- same visibility into logs and docs +- same replication count + +Do not give one harness extra tools, hidden memory, or a longer prompt unless that is the variable under test. + +## Phases + +### Phase A: Cheap scaffold-only screen + +Purpose: +Measure scaffold effects while holding the model fixed. 
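+As a sanity check on the run-count arithmetic used below, the Phase A grid can be enumerated directly. This is an illustrative sketch only (not tooling shipped with this patch); the names come from the panels that follow.
+
+```python
+from itertools import product
+
+harnesses = ["mini", "swe-agent", "openhands", "codex-sdk"]
+opponents = [
+    "@anthropic/claude-sonnet-4-5-20250929",
+    "@openai/o3",
+    "@x-ai/grok-code-fast-1",
+]
+arenas = ["BattleSnake", "CoreWar", "RobotRumble"]
+reps = range(2)
+
+cells = list(product(harnesses, opponents, arenas, reps))
+assert len(cells) == 72  # 4 harnesses x 3 opponents x 3 arenas x 2 reps
+```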
+ +System under test: + +- model: `@openai/gpt-5-mini` +- harnesses: + - `mini` (existing baseline) + - `swe-agent` (planned) + - `openhands` (planned) + - `codex-sdk` (planned thin adapter, not the full Codex product stack) + +Opponents: + +- `@anthropic/claude-sonnet-4-5-20250929` +- `@openai/o3` +- `@x-ai/grok-code-fast-1` + +Arenas: + +- `BattleSnake` (`r5`, `s1000`) +- `CoreWar` (`r5`, `s1000`) +- `RobotRumble` (`r5`, `s250`) + +Replications: + +- `2` independent tournaments per cell + +Run count: + +- `4 harnesses x 3 opponents x 3 arenas x 2 reps = 72 tournaments` + +Advance rule: + +- Promote the best two harnesses by pooled Elo / win rate +- Require no obvious regression in validation rate, bash success, or recovery after failure + +### Phase B: Cheap Codex stack test + +Purpose: +Separate generic scaffold effects from model-stack co-design. + +Systems: + +- `best_generic_harness + @openai/gpt-5-mini` +- `best_generic_harness + @openai/gpt-5.1-codex-mini` +- `codex-sdk + @openai/gpt-5.1-codex-mini` + +Opponents: + +- same as Phase A + +Arenas: + +- same as Phase A + +Replications: + +- `2` independent tournaments per cell + +Run count: + +- `3 systems x 3 opponents x 3 arenas x 2 reps = 54 tournaments` + +Interpretation: + +- If `codex-sdk + gpt-5.1-codex-mini` beats `best_generic_harness + gpt-5.1-codex-mini`, the scaffold matters. +- If `best_generic_harness + gpt-5.1-codex-mini` already captures most of the gain, the model matters more than the scaffold. + +### Phase C: Expensive confirmation + +Purpose: +Confirm the screen on a stronger model after the cheap runs identify promising cells. + +Systems: + +- top `2` systems from Phase B + +Model: + +- `@openai/gpt-5.4` + +Opponents: + +- `@anthropic/claude-sonnet-4-5-20250929` +- `@openai/o3` + +Arenas: + +- all six benchmark arenas + +Tournament budget: + +- `r15` +- standard paper simulation counts per arena + +Replications: + +- `1` independent tournament per cell + +Run count: + +- `2 systems x 2 opponents x 6 arenas x 1 rep = 24 tournaments` + +## Primary metrics + +- pooled Elo +- per-arena Elo +- head-to-head win rate excluding ties +- top-1 consistency under bootstrap +- pairwise order agreement under bootstrap + +## Diagnostic metrics + +- bash/action success rate +- next-step recovery after failed command +- fraction of rounds with grounded edits +- fraction of rounds with simulation-based validation +- fraction of rounds with unit-test validation +- mean files edited per round +- mean thought length / steps per round + +## Minimum logging requirements + +For every tournament, retain: + +- `metadata.json` +- trajectories +- per-round diffs +- round stats +- cost and API call counts + +For scaffold adapters, also log: + +- prompt template used +- tool whitelist / sandbox mode +- whether notes persist across rounds +- any harness-specific retries or auto-fixes + +## Why this matrix + +This matrix deliberately starts with cheap screening. `GPT-5.4` should only be used after the cheap phase narrows the search space. The point is not to prove that one harness wins one benchmark snapshot; the point is to isolate whether improvements survive when model, budget, and arena are held fixed. diff --git a/configs/ablations/scaffold/manifest.yaml b/configs/ablations/scaffold/manifest.yaml new file mode 100644 index 00000000..d9db41a6 --- /dev/null +++ b/configs/ablations/scaffold/manifest.yaml @@ -0,0 +1,142 @@ +version: 1 + +notes: + - "This is an experiment manifest, not an execution format consumed by CodeClash today." 
+ - "Current repo support is limited to agent=mini and agent=dummy." + - "Cells with blocked_on_adapter require a new agent implementation." + +fairness_constraints: + same_arena: true + same_rounds_per_cell: true + same_step_limit_per_round: true + same_cost_limit_per_round: true + same_repo_snapshot: true + same_tool_surface: true + same_log_visibility: true + same_replication_count: true + +opponent_panel: + - model: "@anthropic/claude-sonnet-4-5-20250929" + - model: "@openai/o3" + - model: "@x-ai/grok-code-fast-1" + +arena_panel_screen: + - arena: "BattleSnake" + rounds: 5 + sims_per_round: 1000 + - arena: "CoreWar" + rounds: 5 + sims_per_round: 1000 + - arena: "RobotRumble" + rounds: 5 + sims_per_round: 250 + +arena_panel_confirm: + - arena: "BattleSnake" + rounds: 15 + sims_per_round: 1000 + - arena: "CoreWar" + rounds: 15 + sims_per_round: 1000 + - arena: "Halite" + rounds: 15 + sims_per_round: 250 + - arena: "HuskyBench" + rounds: 15 + sims_per_round: 100 + - arena: "RoboCode" + rounds: 15 + sims_per_round: 250 + - arena: "RobotRumble" + rounds: 15 + sims_per_round: 250 + +phases: + - id: "phase_a_scaffold_screen" + status: "partially_blocked" + replications: 2 + goal: "Measure harness effects while holding the model fixed." + systems: + - id: "mini__gpt5mini" + agent: "mini" + model: "@openai/gpt-5-mini" + status: "ready_now" + - id: "swe_agent__gpt5mini" + agent: "swe-agent" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + - id: "openhands__gpt5mini" + agent: "openhands" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + - id: "codex_sdk__gpt5mini" + agent: "codex-sdk" + model: "@openai/gpt-5-mini" + status: "blocked_on_adapter" + run_count_formula: "4 harnesses x 3 opponents x 3 arenas x 2 reps" + run_count_total: 72 + success_rule: + shortlist_top_systems: 2 + require_no_major_regression_in: + - "simulation_validation_rate" + - "bash_success_rate" + - "recovery_after_failed_command" + + - id: "phase_b_codex_stack" + status: "blocked" + replications: 2 + goal: "Separate generic scaffold effects from Codex-specific model-stack co-design." + systems: + - id: "best_generic__gpt5mini" + agent: "TBD_from_phase_a" + model: "@openai/gpt-5-mini" + status: "blocked_on_phase_a" + - id: "best_generic__gpt5_1_codex_mini" + agent: "TBD_from_phase_a" + model: "@openai/gpt-5.1-codex-mini" + status: "blocked_on_phase_a_and_model_entry" + - id: "codex_sdk__gpt5_1_codex_mini" + agent: "codex-sdk" + model: "@openai/gpt-5.1-codex-mini" + status: "blocked_on_adapter_and_model_entry" + run_count_formula: "3 systems x 3 opponents x 3 arenas x 2 reps" + run_count_total: 54 + interpretation_checks: + - "Does codex-sdk help when the model is held fixed at gpt-5.1-codex-mini?" + - "Does the Codex-family model help even without the codex-sdk harness?" + + - id: "phase_c_confirm_gpt54" + status: "blocked" + replications: 1 + goal: "Confirm the best systems on a stronger expensive model." 
+ systems: + - id: "top1_from_phase_b__gpt54" + agent: "TBD_from_phase_b" + model: "@openai/gpt-5.4" + status: "blocked_on_phase_b_and_model_entry" + - id: "top2_from_phase_b__gpt54" + agent: "TBD_from_phase_b" + model: "@openai/gpt-5.4" + status: "blocked_on_phase_b_and_model_entry" + opponents: + - model: "@anthropic/claude-sonnet-4-5-20250929" + - model: "@openai/o3" + run_count_formula: "2 systems x 2 opponents x 6 arenas x 1 rep" + run_count_total: 24 + +metrics: + primary: + - "pooled_elo" + - "per_arena_elo" + - "head_to_head_win_rate_excluding_ties" + - "bootstrap_top1_consistency" + - "bootstrap_pairwise_order_agreement" + diagnostics: + - "bash_success_rate" + - "recovery_after_failed_command" + - "grounded_edit_rate" + - "simulation_validation_rate" + - "unit_test_validation_rate" + - "files_edited_per_round" + - "steps_per_round" + - "model_cost_usd" diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml new file mode 100644 index 00000000..c1df5b3e --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/BattleSnake__gpt-5.3-codex__gpt-5__r15__s1000.yaml @@ -0,0 +1,40 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board. + Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml new file mode 100644 index 00000000..def48781 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/CoreWar__gpt-5.3-codex__gpt-5__r15__s1000.yaml @@ -0,0 +1,37 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+ CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..8640a350 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/Halite__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,48 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite. + Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust. + Example implementations can be found under the `airesources/` folder. + Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages. + Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language. + You may include additional files as needed, but please ensure: + 1. The `submission/` folder contains only files relevant to your bot. + 2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission). + 3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot. + + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `submission`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+ All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml new file mode 100644 index 00000000..53df4943 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/HuskyBench__gpt-5.3-codex__gpt-5__r15__s100.yaml @@ -0,0 +1,39 @@ +tournament: + rounds: 15 +game: + name: HuskyBench + sims_per_round: 100 + args: {} +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called HuskyBench. + In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips. + Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively. + Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round. + You can use run_game.sh to check if your bot runs in time. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `client/player.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..ec1c8774 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/RoboCode__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,42 @@ +tournament: + rounds: 15 +game: + name: RoboCode + sims_per_round: 250 + args: + nodisplay: true + nosound: true + record_ratio: 0.2 +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RoboCode. + Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar. + Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots. + Your bot logic must be written in Java and located in the `robots/custom/` directory. + Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like. + + The game is played in 15 rounds.
For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robots/custom/`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml new file mode 100644 index 00000000..9663e660 --- /dev/null +++ b/configs/generated/gpt-5.3-codex_vs_gpt-5/RobotRumble__gpt-5.3-codex__gpt-5__r15__s250.yaml @@ -0,0 +1,39 @@ +tournament: + rounds: 15 +game: + name: RobotRumble + sims_per_round: 250 + args: + raw: true +players: +- agent: mini + name: gpt-5.3-codex + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5 + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RobotRumble. + RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid. + Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match. + NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robot.js`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml new file mode 100644 index 00000000..c2d34dde --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/BattleSnake__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml @@ -0,0 +1,36 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board.
+ Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml new file mode 100644 index 00000000..f2e539c7 --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/CoreWar__gpt-5.4-default__gpt-5.3-codex-default__r15__s1000.yaml @@ -0,0 +1,33 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar. + CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..fc8f7eee --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/Halite__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,44 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite.
+ Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust. + Example implementations can be found under the `airesources/` folder. + Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages. + Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language. + You may include additional files as needed, but please ensure: + 1. The `submission/` folder contains only files relevant to your bot. + 2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission). + 3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot. + + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `submission`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml new file mode 100644 index 00000000..7ccd9220 --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/HuskyBench__gpt-5.4-default__gpt-5.3-codex-default__r15__s100.yaml @@ -0,0 +1,35 @@ +tournament: + rounds: 15 +game: + name: HuskyBench + sims_per_round: 100 + args: {} +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called HuskyBench. + In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips. + Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively. + Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round. + You can use run_game.sh to check if your bot runs in time. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically.
+ + Your task: improve the bot in `client/player.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..72fdaf5c --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RoboCode__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,38 @@ +tournament: + rounds: 15 +game: + name: RoboCode + sims_per_round: 250 + args: + nodisplay: true + nosound: true + record_ratio: 0.2 +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RoboCode. + Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar. + Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots. + Your bot logic must be written in Java and located in the `robots/custom/` directory. + Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robots/custom/`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml new file mode 100644 index 00000000..ce4ed43d --- /dev/null +++ b/configs/generated/gpt-5.4-default_vs_gpt-5.3-codex-default/RobotRumble__gpt-5.4-default__gpt-5.3-codex-default__r15__s250.yaml @@ -0,0 +1,35 @@ +tournament: + rounds: 15 +game: + name: RobotRumble + sims_per_round: 250 + args: + raw: true +players: +- agent: mini + name: gpt-5.4-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm +- agent: mini + name: gpt-5.3-codex-default + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called RobotRumble. + RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+ Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match. + NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `robot.js`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml new file mode 100644 index 00000000..7458df6c --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/BattleSnake__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml @@ -0,0 +1,40 @@ +tournament: + rounds: 15 +game: + name: BattleSnake + sims_per_round: 1000 + args: + width: 11 + height: 11 + browser: false +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called BattleSnake. + Your bot (`main.py`) controls a snake on a grid-based board. + Snakes collect food, avoid collisions, and try to outlast their opponents. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `main.py`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below).
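The `-default`/`-low`/`-medium`/`-high` player aliases used throughout these configs are what the new `model_display_name` helper (see the `codeclash/analysis/viz/utils.py` hunk earlier in this patch) turns into plot labels. A quick sketch of the expected behavior, assuming the bare base names (`gpt-5.4`, `gpt-5.3-codex`) have no entry of their own in `MODEL_TO_DISPLAY_NAME`:

```python
from codeclash.analysis.viz.utils import model_display_name

# A recognized tier suffix is split off and prettified; unknown base names pass through.
assert model_display_name("gpt-5.4-high") == "gpt-5.4 (High)"
assert model_display_name("gpt-5.3-codex-default") == "gpt-5.3-codex (Default)"
# Names without a tier suffix go through the lookup table unchanged (e.g. "o3").
assert model_display_name("o3") == "o3"
```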
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml new file mode 100644 index 00000000..3b856293 --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/CoreWar__gpt-5.4-high__gpt-5.3-codex-high__r15__s1000.yaml @@ -0,0 +1,37 @@ +tournament: + rounds: 15 +game: + name: CoreWar + sims_per_round: 1000 + args: {} +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called CoreWar. + CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate. + Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core. + + The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}. + After you and your competitor finish editing your codebases, the game is run automatically. + + Your task: improve the bot in `warrior.red`, located in {{working_dir}}. + {{working_dir}} is your codebase, which contains both your bot and supporting assets. + All of your commands will be executed in the {{working_dir}} directory (see notes below). diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml new file mode 100644 index 00000000..ae84473f --- /dev/null +++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml @@ -0,0 +1,48 @@ +tournament: + rounds: 15 +game: + name: Halite + sims_per_round: 250 + args: {} +players: +- agent: mini + name: gpt-5.4-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.4' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +- agent: mini + name: gpt-5.3-codex-high + config: + agent: !include mini/default.yaml + model: + model_name: 'openai/gpt-5.3-codex' + model_class: litellm + model_kwargs: + reasoning_effort: 'high' +prompts: + game_description: |- + You are a software developer ({{player_id}}) competing in a coding game called Halite. + Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength. + Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it. + The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions. + + You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml
new file mode 100644
index 00000000..c057921f
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
new file mode 100644
index 00000000..b52edef8
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
new file mode 100644
index 00000000..f88367f1
--- /dev/null
+++ b/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5.3-codex-high
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
new file mode 100644
index 00000000..63dee234
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/BattleSnake__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
new file mode 100644
index 00000000..73ea79a6
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/CoreWar__gpt-5.4-low__gpt-5.3-codex-low__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..d71a6bbc
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/Halite__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml
new file mode 100644
index 00000000..e3b67725
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/HuskyBench__gpt-5.4-low__gpt-5.3-codex-low__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..534db3b2
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RoboCode__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
new file mode 100644
index 00000000..12a98655
--- /dev/null
+++ b/configs/generated/gpt-5.4-low_vs_gpt-5.3-codex-low/RobotRumble__gpt-5.4-low__gpt-5.3-codex-low__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+- agent: mini
+  name: gpt-5.3-codex-low
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'low'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
new file mode 100644
index 00000000..3f3a3de3
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/BattleSnake__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
new file mode 100644
index 00000000..6ad0b786
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/CoreWar__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..b55e7a00
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/Halite__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml
new file mode 100644
index 00000000..4bed924f
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/HuskyBench__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..51ae0878
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RoboCode__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
new file mode 100644
index 00000000..5ceed406
--- /dev/null
+++ b/configs/generated/gpt-5.4-medium_vs_gpt-5.3-codex-medium/RobotRumble__gpt-5.4-medium__gpt-5.3-codex-medium__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+- agent: mini
+  name: gpt-5.3-codex-medium
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.3-codex'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'medium'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml
new file mode 100644
index 00000000..04d2e83e
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/BattleSnake__gpt-5.4__gpt-5__r15__s1000.yaml
@@ -0,0 +1,40 @@
+tournament:
+  rounds: 15
+game:
+  name: BattleSnake
+  sims_per_round: 1000
+  args:
+    width: 11
+    height: 11
+    browser: false
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called BattleSnake.
+    Your bot (`main.py`) controls a snake on a grid-based board.
+    Snakes collect food, avoid collisions, and try to outlast their opponents.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `main.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml b/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml
new file mode 100644
index 00000000..50a28315
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/CoreWar__gpt-5.4__gpt-5__r15__s1000.yaml
@@ -0,0 +1,37 @@
+tournament:
+  rounds: 15
+game:
+  name: CoreWar
+  sims_per_round: 1000
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called CoreWar.
+    CoreWar is a programming battle where you write "warriors" in an assembly-like language called Redcode to compete within a virtual machine (MARS), aiming to eliminate your rivals by making their code self-terminate.
+    Victory comes from crafting clever tactics—replicators, scanners, bombers—that exploit memory layout and instruction timing to control the core.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `warrior.red`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..dff0c56f
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/Halite__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,48 @@
+tournament:
+  rounds: 15
+game:
+  name: Halite
+  sims_per_round: 250
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called Halite.
+    Halite is a multi-player turn-based strategy game where bots compete on a rectangular grid to capture territory and accumulate strength.
+    Players control pieces that can move across the map to conquer neutral and enemy territory, with each cell providing production that increases the strength of pieces occupying it.
+    The goal is to control the most territory by the end of the game through strategic expansion, consolidation of forces, and tactical combat decisions.
+
+    You have the choice of writing your Halite bot in one of four programming languages: C, C++, OCaml, or Rust.
+    Example implementations can be found under the `airesources/` folder.
+    Your submission should be stored in the `submission/` folder. This folder currently contains an example C bot, but feel free to use any of the supported languages.
+    Please make sure your main file is named `main.<ext>`, where `<ext>` is the appropriate file extension for your chosen programming language.
+    You may include additional files as needed, but please ensure:
+    1. The `submission/` folder contains only files relevant to your bot.
+    2. The `submission/` folder ONLY contains a single bot (no multiple bots in one submission).
+    3. Your bot can be compiled. See `runGame.sh` under the corresponding `submission/<language>/` folder to see how we will compile and run your bot.
+
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `submission`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml b/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml
new file mode 100644
index 00000000..cf4cd1c1
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/HuskyBench__gpt-5.4__gpt-5__r15__s100.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: HuskyBench
+  sims_per_round: 100
+  args: {}
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called HuskyBench.
+    In this game, you will write code to control a poker-playing bot, aiming to outsmart your opponents and win chips.
+    Victory comes from crafting clever strategies—bluffing, reading opponents, and managing your chip stack effectively.
+    Be mindful of your bot's efficiency - your code should complete a simulation within 10 seconds to avoid forfeiting the round.
+    You can use run_game.sh to check if your bot runs in time.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `client/player.py`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..b07ab7b6
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/RoboCode__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,42 @@
+tournament:
+  rounds: 15
+game:
+  name: RoboCode
+  sims_per_round: 250
+  args:
+    nodisplay: true
+    nosound: true
+    record_ratio: 0.2
+players:
+- agent: mini
+  name: gpt5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RoboCode.
+    Robocode (Tank Royale) is a programming game where your code is the tank: each turn your bot sends intents—speed plus body/gun/radar turn rates and firepower—based on the game state it perceives via radar.
+    Your program decides how to move, aim, and fire in a deterministic, turn-based arena to outlast other bots.
+    Your bot logic must be written in Java and located in the `robots/custom/` directory.
+    Keep the main bot class named `MyTank.java`, but you can include additional Java files if you'd like.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robots/custom/`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml b/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml
new file mode 100644
index 00000000..b6eef98a
--- /dev/null
+++ b/configs/generated/gpt-5.4_vs_gpt-5/RobotRumble__gpt-5.4__gpt-5__r15__s250.yaml
@@ -0,0 +1,39 @@
+tournament:
+  rounds: 15
+game:
+  name: RobotRumble
+  sims_per_round: 250
+  args:
+    raw: true
+players:
+- agent: mini
+  name: gpt-5.4
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5.4'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+- agent: mini
+  name: gpt-5
+  config:
+    agent: !include mini/default.yaml
+    model:
+      model_name: 'openai/gpt-5'
+      model_class: litellm
+      model_kwargs:
+        reasoning_effort: 'high'
+prompts:
+  game_description: |-
+    You are a software developer ({{player_id}}) competing in a coding game called RobotRumble.
+    RobotRumble is a turn-based coding battle where you program a team of robots in Python to move, attack, and outmaneuver your opponent on a grid.
+    Every decision is driven by your code, and victory comes from crafting logic that positions robots smartly, times attacks well, and adapts over the 100-turn match.
+    NOTE: Please ensure that your code runs efficiently (under 60 seconds). Code that exceeds this run time will automatically forfeit the round.
+
+    The game is played in 15 rounds. For every round, you (and your competitors) edit program code that controls your bot. This is round {{round}}.
+    After you and your competitor finish editing your codebases, the game is run automatically.
+
+    Your task: improve the bot in `robot.js`, located in {{working_dir}}.
+    {{working_dir}} is your codebase, which contains both your bot and supporting assets.
+    All of your commands will be executed in the {{working_dir}} directory (see notes below).
diff --git a/docs/openai_feedback_20260310.md b/docs/openai_feedback_20260310.md
new file mode 100644
index 00000000..b7ce8ca0
--- /dev/null
+++ b/docs/openai_feedback_20260310.md
@@ -0,0 +1,94 @@
+# OpenAI Benchmark Feedback
+
+Date: March 10, 2026
+
+## What We Ran
+
+We evaluated OpenAI coding models in a long-horizon iterative coding benchmark across six arenas:
+
+- BattleSnake
+- CoreWar
+- Halite
+- HuskyBench
+- RoboCode
+- RobotRumble
+
+Each matchup ran for 15 edit-and-play rounds. This is not a one-shot code generation benchmark; the models repeatedly edited their agents over time using feedback from prior rounds.
+
+## Sweep 1: Prior Pooled Sweep
+
+Run root:
+`/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312`
+
+Shareable plot:
+`/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312/analysis/elo/openai_feedback_per_arena.png`
+
+Overall pooled Elo:
+
+- GPT-5.4: 1298 ± 98
+- GPT-5: 1210 ± 65
+- GPT-5.3-Codex: 1092 ± 108
+
+Interpretation:
+
+- GPT-5.4 was the strongest overall model in the pooled sweep.
+- GPT-5 was generally in the middle.
+- GPT-5.3-Codex trailed overall.
+
+Important nuance:
+
+- GPT-5 still looked stronger in some arenas, especially Halite and HuskyBench.
+- GPT-5.4 comes out ahead overall because it was consistently strong across the full suite, especially in BattleSnake and in the aggregate ranking.
+
+## Sweep 2: Direct GPT-5.4 vs GPT-5.3-Codex by Reasoning Mode
+
+Run root:
+`/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105`
+
+Shareable plots:
+
+- Direct win rate by reasoning mode:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/shareable/reasoning_mode_win_rate.png`
+- GPT-5.4 win rate by arena and reasoning mode:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/shareable/reasoning_mode_arena_heatmap.png`
+- 8-variant Elo view:
+  `/Users/muhtasham/Documents/CodeClash/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105/analysis/elo/all_games_elo.png`
+
+Matched-tier direct head-to-head results (win counts are decisive games; tie counts listed for reference):
+
+- Default: GPT-5.4 won 57, GPT-5.3-Codex won 37, ties 2
+- Low: GPT-5.4 won 48, GPT-5.3-Codex won 45, ties 3
+- Medium: GPT-5.4 won 81, GPT-5.3-Codex won 14, ties 1
+- High: GPT-5.4 won 62, GPT-5.3-Codex won 22, ties 12
+
+Interpretation:
+
+- GPT-5.4 beat GPT-5.3-Codex at every matched reasoning tier in this direct sweep.
+- Medium was the strongest GPT-5.4 setting in this benchmark.
+- High was also strong, but not clearly better than Medium.
+- Low was close to parity.
+
+Arena-level pattern:
+
+- GPT-5.4 was very strong in RoboCode across all tiers.
+- GPT-5.4 Medium was especially strong in CoreWar, Halite, HuskyBench, RoboCode, and RobotRumble.
+- GPT-5.3-Codex remained competitive or better in some BattleSnake settings.
+
+## Caveat on the 8-Variant Elo
+
+The 8-way Elo chart for the reasoning sweep should be treated as directional, not definitive.
+
+Reasons:
+
+- the sweep only included same-tier direct pairings
+- the comparison graph is split into four disconnected components
+- there are no bridge matches between tiers
+
+So the most reliable conclusion from Sweep 2 is the direct same-tier head-to-head result, not the exact cross-tier Elo spacing among all eight variants.
+
+## Suggested Product Feedback
+
+- GPT-5.4 looks stronger than GPT-5.3-Codex on long-horizon iterative code improvement, especially at Medium reasoning.
+- More reasoning did not monotonically improve results; Medium outperformed High in this setup.
+- Performance remains arena-dependent. GPT-5 still looked strong in some environments in the pooled sweep, so model choice may depend on task structure rather than only aggregate Elo.
+- For future benchmarking, a connected round-robin across reasoning settings would produce a more trustworthy shared Elo ladder.
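+
+## Appendix: Sanity-Checking the Head-to-Head Margins
+
+A quick back-of-envelope check on the matched-tier counts above. This is a minimal sketch using a normal-approximation confidence interval on the decisive-game win rate; it is illustrative only and not part of the eval pipeline:
+
+```python
+from math import sqrt
+
+# Decisive (non-tie) game counts from Sweep 2: (GPT-5.4 wins, GPT-5.3-Codex wins)
+results = {"default": (57, 37), "low": (48, 45), "medium": (81, 14), "high": (62, 22)}
+
+for tier, (wins_54, wins_codex) in results.items():
+    n = wins_54 + wins_codex
+    p = wins_54 / n
+    half_width = 1.96 * sqrt(p * (1 - p) / n)  # ~95% CI via normal approximation
+    print(f"{tier:>7}: GPT-5.4 {100 * p:.1f}% +/- {100 * half_width:.1f}pp over {n} decisive games")
+```
+
+On these counts the Low tier's interval comfortably spans 50%, which is why we describe it as close to parity, while the Medium and High intervals do not.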
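+
+## Appendix: Checking Pairing-Graph Connectivity
+
+The connectivity caveat can be verified mechanically. The sketch below assumes the same `metadata.json` layout that `scripts/plot_reasoning_head_to_head.py` reads; `pairing_components` is a hypothetical helper written for illustration, not part of the repo:
+
+```python
+import json
+from itertools import combinations
+from pathlib import Path
+
+
+def pairing_components(run_root: Path) -> list[set[str]]:
+    """Group player names into connected components of the pairing graph."""
+    parent: dict[str, str] = {}
+
+    def find(node: str) -> str:
+        parent.setdefault(node, node)
+        while parent[node] != node:
+            parent[node] = parent[parent[node]]  # path halving
+            node = parent[node]
+        return node
+
+    for meta in sorted(run_root.rglob("metadata.json")):
+        players = [
+            p.get("name")
+            for p in json.loads(meta.read_text()).get("config", {}).get("players", [])
+            if isinstance(p, dict)
+        ]
+        for a, b in combinations(players, 2):
+            parent[find(a)] = find(b)  # union the two players' components
+
+    groups: dict[str, set[str]] = {}
+    for node in parent:
+        groups.setdefault(find(node), set()).add(node)
+    return list(groups.values())
+
+
+# Four components (one per reasoning tier) means the data cannot pin down
+# relative Elo between variants that never played each other.
+```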
diff --git a/scripts/finalize_openai_sweep_report.sh b/scripts/finalize_openai_sweep_report.sh
new file mode 100755
index 00000000..9743c6e7
--- /dev/null
+++ b/scripts/finalize_openai_sweep_report.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+set -euo pipefail
+RUN_ROOT="/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312"
+REPORT="$RUN_ROOT/analysis/openai_vs_previous_leaderboard_summary.md"
+
+uv run python - <<'PY'
+import json, time
+from pathlib import Path
+run=Path('/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312')
+patterns=[
+    'PvpTournament.HuskyBench.r15.s100.p2.gpt-5.gpt-5.4.gpt-5.4-vs-gpt-5.*',
+    'PvpTournament.HuskyBench.r15.s100.p2.gpt-5.gpt-5.3-codex.gpt-5.3-codex-vs-gpt-5.*',
+]
+need={str(i) for i in range(16)}
+for _ in range(720):
+    done=0
+    for pat in patterns:
+        ds=sorted(run.glob(pat), key=lambda p:p.stat().st_mtime, reverse=True)
+        if not ds: continue
+        m=ds[0]/'metadata.json'
+        if not m.exists(): continue
+        try:
+            j=json.loads(m.read_text())
+        except (OSError, json.JSONDecodeError):
+            continue
+        rs=set(j.get('round_stats',{}).keys())
+        if need.issubset(rs): done+=1
+    if done==2:
+        print('huskybench complete')
+        break
+    time.sleep(30)
+else:
+    raise SystemExit('timeout waiting for huskybench completion')
+PY
+
+/Users/muhtasham/Documents/CodeClash/scripts/run_eval_pipeline.sh --log-dir "$RUN_ROOT"
+
+uv run python - <<'PY'
+from pathlib import Path
+run=Path('/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312')
+report=run/'analysis'/'openai_vs_previous_leaderboard_summary.md'
+report.parent.mkdir(parents=True, exist_ok=True)
+all_tex=run/'analysis'/'elo'/'elo_table_plain.tex'
+text=all_tex.read_text() if all_tex.exists() else ''
+rows=[]
+for line in text.splitlines():
+    if '&' in line and '\\\\' in line and 'Model' not in line:
+        parts=[p.strip() for p in line.replace('\\\\','').split('&')]
+        if len(parts)>=3:
+            rows.append(parts)
+
+prev={
+'Claude Sonnet 4.5':1385,'GPT-5':1366,'o3':1343,'Claude Sonnet 4':1224,
+'GPT-5 Mini':1199,'Gemini 2.5 Pro':1124,'Grok Code Fast':1006,'Qwen3 Coder':952,
+}
+
+lines=[]
+lines.append('# OpenAI Sweep vs Previous Leaderboard')
+lines.append('')
+lines.append('Current run root: `/Users/muhtasham/Documents/CodeClash/logs/new_openai_sweep_20260307_184312`')
+lines.append('')
+lines.append('## New-model ALL Elo (this run)')
+if rows:
+    lines.append('| Model | Elo |')
+    lines.append('|---|---:|')
+    for r in rows:
+        model=r[0]
+        elo=r[1] if len(r)>1 else ''
+        if any(k in model for k in ['gpt-5.4','gpt-5.3-codex','gpt-5']):
+            lines.append(f'| {model} | {elo} |')
+else:
+    lines.append('_Could not parse elo_table_plain.tex_')
+
+lines.append('')
+lines.append('## Comparison to previous public leaderboard (ALL Elo)')
+# reference table of previous public leaderboard Elos
+lines.append('| Model | Elo |')
+lines.append('|---|---:|')
+for model, elo in prev.items():
+    lines.append(f'| {model} | {elo} |')
+lines.append('')
+lines.append('- Previous top baseline in your provided table: **Claude Sonnet 4.5 = 1385 ± 18**')
+lines.append('- Previous **GPT-5 = 1366 ± 17**')
+lines.append('')
+lines.append('Interpretation guidance: this run uses a much smaller model set (gpt-5, gpt-5.4, gpt-5.3-codex), so Elo scale can shift. Compare directionally, not as strict absolute replacement of global board ranks.')
+
+report.write_text('\n'.join(lines)+'\n')
+print(report)
+PY
diff --git a/scripts/plot_leaderboard_comparison.py b/scripts/plot_leaderboard_comparison.py
new file mode 100644
index 00000000..038d33cb
--- /dev/null
+++ b/scripts/plot_leaderboard_comparison.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+
+import matplotlib.pyplot as plt
+
+from codeclash.analysis.viz.utils import FONT_BOLD, FONT_REG, MARKERS, MODEL_TO_COLOR, model_display_name
+
+
+GAME_ORDER = ["halite", "huskybench", "corewar", "robotrumble", "robocode", "battlesnake", "all"]
+GAME_LABELS = {
+    "halite": "Halite",
+    "huskybench": "HuskyBench",
+    "corewar": "CoreWar",
+    "robotrumble": "RobotRumble",
+    "robocode": "RoboCode",
+    "battlesnake": "BattleSnake",
+    "all": "Overall",
+}
+
+
+def canonical_model_name(name: str) -> str:
+    return name.lower()
+
+
+def base_model_name(name: str) -> str:
+    for suffix in ("-default", "-low", "-medium", "-high"):
+        if name.endswith(suffix):
+            return name[: -len(suffix)]
+    return name
+
+
+def load_rows(leaderboards_path: Path) -> tuple[list[str], dict[str, dict[str, tuple[int, int]]]]:
+    raw = json.loads(leaderboards_path.read_text())
+    models: list[str] = []
+    rows: dict[str, dict[str, tuple[int, int]]] = {}
+
+    for game_key in GAME_ORDER:
+        board = raw.get(game_key, {}).get("board", [])
+        rows[game_key] = {}
+        for entry in board:
+            model = canonical_model_name(entry["model"])
+            if model not in models:
+                models.append(model)
+            rows[game_key][model] = (int(entry["elo"]), int(entry["elo_std"]))
+    return models, rows
+
+
+def plot(rows: dict[str, dict[str, tuple[int, int]]], models: list[str], output_base: Path, title: str) -> None:
+    fig, ax = plt.subplots(figsize=(11.5, 7.0))
+
+    y_positions = list(range(len(GAME_ORDER)))
+    spread = 0.48
+    if len(models) == 1:
+        offsets = [0.0]
+    else:
+        step = spread / (len(models) - 1)
+        offsets = [(-spread / 2) + (i * step) for i in range(len(models))]
+
+    for idx, model in enumerate(models):
+        ys = [y + offsets[idx] for y in y_positions]
+        # models missing from a board fall back to (0, 0), matching print_leaderboard_table.py
+        xs = [rows[g].get(model, (0, 0))[0] for g in GAME_ORDER]
+        xerr = [rows[g].get(model, (0, 0))[1] for g in GAME_ORDER]
+        color = MODEL_TO_COLOR.get(model, MODEL_TO_COLOR.get(base_model_name(model), "#4C78A8"))
+        marker = MARKERS[idx % len(MARKERS)]
+
+        ax.errorbar(
+            xs,
+            ys,
+            xerr=xerr,
+            fmt=marker,
+            color=color,
+            ecolor=color,
+            elinewidth=1.6,
+            capsize=3,
+            markersize=7,
+            label=model_display_name(model),
+        )
+
+    ax.set_yticks(y_positions)
+    ax.set_yticklabels([GAME_LABELS[g] for g in GAME_ORDER], fontproperties=FONT_REG, fontsize=12)
+    ax.invert_yaxis()
+    ax.set_xlabel("Elo", fontproperties=FONT_BOLD, fontsize=13)
+    ax.set_title(title, fontproperties=FONT_BOLD, fontsize=15, pad=12)
+    ax.grid(axis="x", alpha=0.25)
+    ax.axvline(1200, color="#888888", linestyle="--", linewidth=1, alpha=0.5)
+    ax.legend(frameon=False, prop=FONT_REG, loc="lower right", ncol=2)
+
+    for spine in ["top", "right"]:
+        ax.spines[spine].set_visible(False)
+
+    fig.tight_layout()
+    fig.savefig(output_base.with_suffix(".png"), dpi=220, bbox_inches="tight")
+    fig.savefig(output_base.with_suffix(".pdf"), bbox_inches="tight")
+    plt.close(fig)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Create a shareable per-game Elo comparison plot.")
+    parser.add_argument("leaderboards_json", type=Path)
parser.add_argument("--output-base", type=Path, required=True) + parser.add_argument("--title", type=str, default="OpenAI Benchmark Results by Arena") + args = parser.parse_args() + + models, rows = load_rows(args.leaderboards_json) + plot(rows, models, args.output_base, args.title) + + +if __name__ == "__main__": + main() diff --git a/scripts/plot_reasoning_head_to_head.py b/scripts/plot_reasoning_head_to_head.py new file mode 100644 index 00000000..d183c145 --- /dev/null +++ b/scripts/plot_reasoning_head_to_head.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +from __future__ import annotations + +import argparse +import json +from collections import defaultdict +from pathlib import Path + +import matplotlib.pyplot as plt +import numpy as np + +from codeclash.analysis.viz.utils import FONT_BOLD, FONT_REG + + +TIER_ORDER = ["default", "low", "medium", "high"] +ARENA_ORDER = ["BattleSnake", "CoreWar", "Halite", "HuskyBench", "RoboCode", "RobotRumble"] +ARENA_LABELS = { + "BattleSnake": "BattleSnake", + "CoreWar": "CoreWar", + "Halite": "Halite", + "HuskyBench": "HuskyBench", + "RoboCode": "RoboCode", + "RobotRumble": "RobotRumble", +} + + +def iter_live_metadata(run_root: Path): + for metadata_path in sorted(run_root.rglob("metadata.json")): + if "quarantine" in metadata_path.parts: + continue + yield metadata_path + + +def parse_tier(model_name: str) -> str: + for tier in TIER_ORDER: + if model_name.endswith(f"-{tier}"): + return tier + raise ValueError(f"Could not parse tier from {model_name}") + + +def load_stats(run_root: Path): + by_tier = {tier: {"gpt-5.4": 0, "gpt-5.3-codex": 0, "ties": 0, "total": 0} for tier in TIER_ORDER} + by_arena_tier = { + arena: {tier: {"gpt-5.4": 0, "gpt-5.3-codex": 0, "ties": 0, "total": 0} for tier in TIER_ORDER} + for arena in ARENA_ORDER + } + + for metadata_path in iter_live_metadata(run_root): + data = json.loads(metadata_path.read_text()) + game_name = data.get("config", {}).get("game", {}).get("name") or data.get("game", {}).get("name") + if game_name not in ARENA_ORDER: + continue + + players = [p.get("name") for p in data.get("config", {}).get("players", []) if isinstance(p, dict)] + if len(players) != 2: + continue + + tier = parse_tier(players[0]) + rounds = data.get("round_stats", {}) + if isinstance(rounds, dict): + rounds = list(rounds.values()) + + for round_stat in rounds: + winner = round_stat.get("winner") + bucket = by_tier[tier] + arena_bucket = by_arena_tier[game_name][tier] + if winner == "Tie": + bucket["ties"] += 1 + arena_bucket["ties"] += 1 + elif winner and winner.startswith("gpt-5.4"): + bucket["gpt-5.4"] += 1 + arena_bucket["gpt-5.4"] += 1 + elif winner and winner.startswith("gpt-5.3-codex"): + bucket["gpt-5.3-codex"] += 1 + arena_bucket["gpt-5.3-codex"] += 1 + else: + continue + bucket["total"] += 1 + arena_bucket["total"] += 1 + + return by_tier, by_arena_tier + + +def plot_overall(by_tier, output_path: Path) -> None: + tiers = TIER_ORDER + gpt54_rates = [] + codex_rates = [] + + for tier in tiers: + total = by_tier[tier]["total"] or 1 + gpt54_rates.append(100 * by_tier[tier]["gpt-5.4"] / total) + codex_rates.append(100 * by_tier[tier]["gpt-5.3-codex"] / total) + + y = np.arange(len(tiers)) + fig, ax = plt.subplots(figsize=(9, 4.8)) + + ax.barh(y - 0.18, gpt54_rates, height=0.34, color="#0B8F55", label="GPT-5.4") + ax.barh(y + 0.18, codex_rates, height=0.34, color="#C75B12", label="GPT-5.3-Codex") + + for idx, tier in enumerate(tiers): + total = by_tier[tier]["total"] + ax.text(gpt54_rates[idx] + 1.2, y[idx] - 0.18, 
f"{by_tier[tier]['gpt-5.4']}/{total}", va="center", fontproperties=FONT_REG, fontsize=10) + ax.text(codex_rates[idx] + 1.2, y[idx] + 0.18, f"{by_tier[tier]['gpt-5.3-codex']}/{total}", va="center", fontproperties=FONT_REG, fontsize=10) + + ax.set_yticks(y) + ax.set_yticklabels([tier.title() for tier in tiers], fontproperties=FONT_REG, fontsize=11) + ax.set_xlim(0, 100) + ax.set_xlabel("Win Rate Excluding Ties (%)", fontproperties=FONT_BOLD, fontsize=12) + ax.set_title("Direct Head-to-Head Win Rate by Reasoning Mode", fontproperties=FONT_BOLD, fontsize=14, pad=10) + ax.grid(axis="x", alpha=0.25) + ax.legend(frameon=False, prop=FONT_REG, loc="lower right") + for spine in ("top", "right"): + ax.spines[spine].set_visible(False) + + fig.tight_layout() + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def plot_heatmap(by_arena_tier, output_path: Path) -> None: + matrix = [] + annotations = [] + for arena in ARENA_ORDER: + row = [] + ann_row = [] + for tier in TIER_ORDER: + bucket = by_arena_tier[arena][tier] + total = bucket["total"] or 1 + rate = 100 * bucket["gpt-5.4"] / total + row.append(rate) + ann_row.append(f"{bucket['gpt-5.4']}/{total}") + matrix.append(row) + annotations.append(ann_row) + + matrix = np.array(matrix) + + fig, ax = plt.subplots(figsize=(8.8, 5.8)) + image = ax.imshow(matrix, cmap="RdYlGn", vmin=0, vmax=100, aspect="auto") + + ax.set_xticks(np.arange(len(TIER_ORDER))) + ax.set_xticklabels([tier.title() for tier in TIER_ORDER], fontproperties=FONT_REG, fontsize=11) + ax.set_yticks(np.arange(len(ARENA_ORDER))) + ax.set_yticklabels([ARENA_LABELS[a] for a in ARENA_ORDER], fontproperties=FONT_REG, fontsize=11) + ax.set_title("GPT-5.4 Win Rate by Arena and Reasoning Mode", fontproperties=FONT_BOLD, fontsize=14, pad=10) + + for row_idx in range(len(ARENA_ORDER)): + for col_idx in range(len(TIER_ORDER)): + value = matrix[row_idx, col_idx] + color = "#111111" if 25 <= value <= 75 else "#FFFFFF" + ax.text( + col_idx, + row_idx, + annotations[row_idx][col_idx], + ha="center", + va="center", + color=color, + fontproperties=FONT_REG, + fontsize=10, + ) + + cbar = fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04) + cbar.set_label("GPT-5.4 Win Rate (%)", fontproperties=FONT_BOLD, fontsize=11) + + fig.tight_layout() + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def plot_per_game(by_arena_tier, output_path: Path) -> None: + fig, axes = plt.subplots(2, 3, figsize=(13.5, 8.2), sharex=True) + axes = axes.flatten() + y = np.arange(len(TIER_ORDER)) + + for ax, arena in zip(axes, ARENA_ORDER): + gpt54_rates = [] + codex_rates = [] + for tier in TIER_ORDER: + bucket = by_arena_tier[arena][tier] + total = bucket["total"] or 1 + gpt54_rates.append(100 * bucket["gpt-5.4"] / total) + codex_rates.append(100 * bucket["gpt-5.3-codex"] / total) + + ax.barh(y - 0.18, gpt54_rates, height=0.34, color="#0B8F55", label="GPT-5.4") + ax.barh(y + 0.18, codex_rates, height=0.34, color="#C75B12", label="GPT-5.3-Codex") + + for idx, tier in enumerate(TIER_ORDER): + total = by_arena_tier[arena][tier]["total"] + ax.text(gpt54_rates[idx] + 1.0, y[idx] - 0.18, f"{by_arena_tier[arena][tier]['gpt-5.4']}/{total}", va="center", fontproperties=FONT_REG, fontsize=8.5) + ax.text(codex_rates[idx] + 1.0, y[idx] + 0.18, f"{by_arena_tier[arena][tier]['gpt-5.3-codex']}/{total}", va="center", 
fontproperties=FONT_REG, fontsize=8.5) + + ax.set_title(ARENA_LABELS[arena], fontproperties=FONT_BOLD, fontsize=12, pad=8) + ax.set_xlim(0, 100) + ax.set_yticks(y) + ax.set_yticklabels([tier.title() for tier in TIER_ORDER], fontproperties=FONT_REG, fontsize=10) + ax.grid(axis="x", alpha=0.2) + for spine in ("top", "right"): + ax.spines[spine].set_visible(False) + + handles, labels = axes[0].get_legend_handles_labels() + fig.legend(handles, labels, frameon=False, prop=FONT_REG, loc="lower center", ncol=2, bbox_to_anchor=(0.5, -0.01)) + fig.suptitle("Direct Head-to-Head Win Rate by Arena and Reasoning Mode", fontproperties=FONT_BOLD, fontsize=15, y=0.98) + fig.supxlabel("Win Rate Excluding Ties (%)", fontproperties=FONT_BOLD, fontsize=12, y=0.04) + fig.tight_layout(rect=(0, 0.05, 1, 0.95)) + fig.savefig(output_path.with_suffix(".png"), dpi=220, bbox_inches="tight") + fig.savefig(output_path.with_suffix(".pdf"), bbox_inches="tight") + plt.close(fig) + + +def main() -> None: + parser = argparse.ArgumentParser(description="Plot direct head-to-head reasoning-mode results from tournament metadata.") + parser.add_argument("run_root", type=Path) + parser.add_argument("--output-dir", type=Path, required=True) + args = parser.parse_args() + + args.output_dir.mkdir(parents=True, exist_ok=True) + by_tier, by_arena_tier = load_stats(args.run_root) + plot_overall(by_tier, args.output_dir / "reasoning_mode_win_rate") + plot_heatmap(by_arena_tier, args.output_dir / "reasoning_mode_arena_heatmap") + plot_per_game(by_arena_tier, args.output_dir / "reasoning_mode_win_rate_per_game") + + +if __name__ == "__main__": + main() diff --git a/scripts/print_leaderboard_table.py b/scripts/print_leaderboard_table.py new file mode 100644 index 00000000..9108d336 --- /dev/null +++ b/scripts/print_leaderboard_table.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 +import argparse +import json +from pathlib import Path + + +GAME_ORDER = ["halite", "huskybench", "corewar", "robotrumble", "robocode", "battlesnake", "all"] + + +def load_board(path: Path) -> dict: + with path.open() as f: + return json.load(f) + + +def as_lookup(board: list[dict]) -> dict[str, tuple[int, int]]: + return {row["model"]: (int(row["elo"]), int(row["elo_std"])) for row in board} + + +def main(input: Path, out: Path) -> None: + data = load_board(input) + models = {row["model"] for section in data.values() for row in section.get("board", [])} + + rows = [] + for model in sorted(models): + all_board = data.get("all", {}).get("board", []) + rank = next((int(r["rank"]) for r in all_board if r["model"] == model), 999) + + vals = [] + for game in GAME_ORDER: + lookup = as_lookup(data.get(game, {}).get("board", [])) + elo, std = lookup.get(model, (0, 0)) + vals.append(f"{elo} ± {std}") + rows.append((rank, model, vals)) + + rows.sort(key=lambda x: x[0]) + + header = "| Rank | Model | Halite | HuskyBench | CoreWar | RobotRumble | Robocode | BattleSnake | All |\n" + header += "|---:|---|---:|---:|---:|---:|---:|---:|---:|\n" + lines = [header] + for rank, model, vals in rows: + lines.append( + f"| {rank} | {model} | {vals[0]} | {vals[1]} | {vals[2]} | {vals[3]} | {vals[4]} | {vals[5]} | {vals[6]} |\n" + ) + + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text("".join(lines)) + print(f"Wrote {out}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=Path, required=True) + parser.add_argument("--out", type=Path, required=True) + args = parser.parse_args() + main(args.input, args.out) diff --git 
a/scripts/run_eval_pipeline.sh b/scripts/run_eval_pipeline.sh new file mode 100755 index 00000000..2905b017 --- /dev/null +++ b/scripts/run_eval_pipeline.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +usage() { + cat <<'EOF' +Usage: + scripts/run_eval_pipeline.sh --log-dir [--output-dir ] [--viewer] + +Description: + Runs post-benchmark analysis pipeline for CodeClash logs: + 1) backfill cost info into metadata + 2) compute win-rate summary + 3) compute Elo rankings + uncertainty outputs + 4) generate win-rate heatmap PDF + 5) render markdown leaderboard table for manual patching + 6) optionally launch local viewer + +Arguments: + --log-dir Required. Root directory containing tournament logs. + --output-dir Optional. Defaults to /analysis. + --viewer Optional. Launch viewer at end (blocks until Ctrl+C). + -h, --help Show this help. +EOF +} + +LOG_DIR="" +OUTPUT_DIR="" +OPEN_VIEWER=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --log-dir) + LOG_DIR="${2:-}" + shift 2 + ;; + --output-dir) + OUTPUT_DIR="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${LOG_DIR}" ]]; then + echo "Error: --log-dir is required." >&2 + usage + exit 1 +fi + +if [[ ! -d "${LOG_DIR}" ]]; then + echo "Error: log directory does not exist: ${LOG_DIR}" >&2 + exit 1 +fi + +if [[ -z "${OUTPUT_DIR}" ]]; then + OUTPUT_DIR="${LOG_DIR%/}/analysis" +fi + +ELO_OUT="${OUTPUT_DIR%/}/elo" +HEATMAP_OUT="${OUTPUT_DIR%/}/heatmap_win_rates.pdf" +TABLE_OUT="${OUTPUT_DIR%/}/leaderboard_table.md" + +mkdir -p "${ELO_OUT}" + +echo "==> Repo root: ${REPO_ROOT}" +echo "==> Log dir: ${LOG_DIR}" +echo "==> Output dir: ${OUTPUT_DIR}" + +cd "${REPO_ROOT}" + +echo "==> Step 1/5: Backfilling cost info into metadata..." +uv run python "${REPO_ROOT}/scripts/include_cost_info_in_metadata.py" "${LOG_DIR}" + +echo "==> Step 2/5: Computing win-rate summary..." +uv run python "${REPO_ROOT}/codeclash/analysis/metrics/win_rate.py" -d "${LOG_DIR}" + +echo "==> Step 3/5: Computing Elo rankings..." +uv run python "${REPO_ROOT}/codeclash/analysis/metrics/elo.py" \ + -d "${LOG_DIR}" \ + --output-dir "${ELO_OUT}" + +echo "==> Step 4/5: Generating win-rate heatmap..." +uv run python "${REPO_ROOT}/codeclash/analysis/viz/heatmap_win_rates.py" \ + -d "${LOG_DIR}" \ + -o "${HEATMAP_OUT}" + +echo "==> Step 5/5: Rendering leaderboard table..." +uv run python "${REPO_ROOT}/scripts/print_leaderboard_table.py" \ + --input "${ELO_OUT}/leaderboards.json" \ + --out "${TABLE_OUT}" + +echo +echo "Pipeline complete." +echo "Elo outputs: ${ELO_OUT}" +echo "Heatmap: ${HEATMAP_OUT}" +echo "Leaderboard table: ${TABLE_OUT}" + +if [[ ${OPEN_VIEWER} -eq 1 ]]; then + echo "==> Launching viewer (Ctrl+C to stop)..." + uv run python "${REPO_ROOT}/scripts/run_viewer.py" -d "${LOG_DIR}" +else + echo "To inspect trajectories: uv run python ${REPO_ROOT}/scripts/run_viewer.py -d ${LOG_DIR}" +fi diff --git a/scripts/run_gpt54_gpt53codex_round_robin.sh b/scripts/run_gpt54_gpt53codex_round_robin.sh new file mode 100755 index 00000000..1dfaef80 --- /dev/null +++ b/scripts/run_gpt54_gpt53codex_round_robin.sh @@ -0,0 +1,312 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" + +RUN_ROOT="${REPO_ROOT}/logs/gpt54_gpt53codex_round_robin_$(date +%Y%m%d_%H%M%S)" +MAX_CONFIG_RETRIES=2 +CONTINUE_ON_ERROR=0 +OPEN_VIEWER=0 +RESUME=0 +DRY_RUN=0 +PARALLEL=0 +MAX_PARALLEL=4 + +usage() { + cat <<'EOF' +Usage: + scripts/run_gpt54_gpt53codex_round_robin.sh [options] + +Description: + Runs a full round robin across 8 variants: + - gpt-5.4-default + - gpt-5.4-low + - gpt-5.4-medium + - gpt-5.4-high + - gpt-5.3-codex-default + - gpt-5.3-codex-low + - gpt-5.3-codex-medium + - gpt-5.3-codex-high + + This creates one connected match graph so Elo is meaningful across all 8 variants. + +Options: + --run-root Set custom logs root for this batch. + --max-config-retries Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue after a failed pairing. + --resume Skip already-completed per-arena configs in an existing --run-root. + --parallel Run pairings in parallel. + --max-parallel Maximum concurrent pairings when --parallel is set (default: 4). + --viewer Launch viewer after the eval pipeline. + --dry-run Print commands without running them. + -h, --help Show help. +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --parallel) + PARALLEL=1 + shift + ;; + --max-parallel) + MAX_PARALLEL="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || [[ "${MAX_PARALLEL}" -lt 1 ]]; then + echo "Error: --max-parallel must be a positive integer, got '${MAX_PARALLEL}'" >&2 + exit 1 +fi + +if [[ ! -x "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" ]]; then + echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" >&2 + exit 1 +fi + +if [[ ! 
-x "${REPO_ROOT}/scripts/run_eval_pipeline.sh" ]]; then + echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_eval_pipeline.sh" >&2 + exit 1 +fi + +mkdir -p "${RUN_ROOT}" + +declare -a VARIANT_KEYS=( + "gpt-5.4-default" + "gpt-5.4-low" + "gpt-5.4-medium" + "gpt-5.4-high" + "gpt-5.3-codex-default" + "gpt-5.3-codex-low" + "gpt-5.3-codex-medium" + "gpt-5.3-codex-high" +) + +variant_model() { + case "$1" in + gpt-5.4-*) echo "openai/gpt-5.4" ;; + gpt-5.3-codex-*) echo "openai/gpt-5.3-codex" ;; + *) echo "Unknown variant: $1" >&2; exit 1 ;; + esac +} + +variant_effort() { + case "$1" in + *-default) echo "" ;; + *-low) echo "low" ;; + *-medium) echo "medium" ;; + *-high) echo "high" ;; + *) echo "Unknown variant: $1" >&2; exit 1 ;; + esac +} + +common_args=( + --log-dir "${RUN_ROOT}" + --max-config-retries "${MAX_CONFIG_RETRIES}" +) + +if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then + common_args+=(--continue-on-error) +fi + +if [[ ${RESUME} -eq 1 ]]; then + common_args+=(--resume) +fi + +run_pairing() { + local player_alias="$1" + local opponent_alias="$2" + local player_model + local opponent_model + local player_effort + local opponent_effort + local -a args + + player_model="$(variant_model "${player_alias}")" + opponent_model="$(variant_model "${opponent_alias}")" + player_effort="$(variant_effort "${player_alias}")" + opponent_effort="$(variant_effort "${opponent_alias}")" + + args=( + "${common_args[@]}" + --model "${player_model}" + --alias "${player_alias}" + --opponent "${opponent_model}" + --opponent-alias "${opponent_alias}" + ) + + if [[ -n "${player_effort}" ]]; then + args+=(--player-reasoning-effort "${player_effort}") + fi + if [[ -n "${opponent_effort}" ]]; then + args+=(--opponent-reasoning-effort "${opponent_effort}") + fi + + echo "==> Pairing: ${player_alias} vs ${opponent_alias}" + "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" "${args[@]}" +} + +print_pairing_command() { + local player_alias="$1" + local opponent_alias="$2" + local player_model + local opponent_model + local player_effort + local opponent_effort + + player_model="$(variant_model "${player_alias}")" + opponent_model="$(variant_model "${opponent_alias}")" + player_effort="$(variant_effort "${player_alias}")" + opponent_effort="$(variant_effort "${opponent_alias}")" + + printf "%s --model %s --alias %s --opponent %s --opponent-alias %s --log-dir %s --max-config-retries %s" \ + "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" \ + "${player_model}" \ + "${player_alias}" \ + "${opponent_model}" \ + "${opponent_alias}" \ + "${RUN_ROOT}" \ + "${MAX_CONFIG_RETRIES}" + + if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then + printf " --continue-on-error" + fi + if [[ ${RESUME} -eq 1 ]]; then + printf " --resume" + fi + if [[ -n "${player_effort}" ]]; then + printf " --player-reasoning-effort %s" "${player_effort}" + fi + if [[ -n "${opponent_effort}" ]]; then + printf " --opponent-reasoning-effort %s" "${opponent_effort}" + fi + printf "\n" +} + +declare -a PAIRS=() +for ((i = 0; i < ${#VARIANT_KEYS[@]}; i++)); do + for ((j = i + 1; j < ${#VARIANT_KEYS[@]}; j++)); do + PAIRS+=("${VARIANT_KEYS[i]}|${VARIANT_KEYS[j]}") + done +done + +echo "==> Repo root: ${REPO_ROOT}" +echo "==> Run root: ${RUN_ROOT}" +echo "==> Variants: ${#VARIANT_KEYS[@]}" +echo "==> Pairings: ${#PAIRS[@]}" +echo "==> Parallel pairings: ${PARALLEL}" +echo "==> Max parallel pairings: ${MAX_PARALLEL}" +echo "==> Continue on error: ${CONTINUE_ON_ERROR}" +echo "==> Resume: ${RESUME}" +echo "==> Max config retries: ${MAX_CONFIG_RETRIES}" + +if [[ 
${DRY_RUN} -eq 1 ]]; then
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    print_pairing_command "${player_alias}" "${opponent_alias}"
+  done
+  if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT} --viewer"
+  else
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT}"
+  fi
+  exit 0
+fi
+
+if [[ ${PARALLEL} -eq 1 ]]; then
+  declare -a PIDS=()
+  declare -a PID_PAIRS=()
+  declare -i FAILURE_COUNT=0
+
+  wait_for_one() {
+    local pid="${PIDS[0]}"
+    local pair="${PID_PAIRS[0]}"
+    local status=0
+
+    # Capture the real exit status: with `if ! wait ...`, $? reflects the
+    # negation (always 0), so a failed pairing would propagate as success.
+    if wait "${pid}"; then
+      echo "==> Pairing finished: ${pair}"
+    else
+      status=$?
+      echo "==> Pairing failed: ${pair} (exit ${status})" >&2
+      FAILURE_COUNT+=1
+      if [[ ${CONTINUE_ON_ERROR} -ne 1 ]]; then
+        echo "==> Stopping due to pairing failure and --continue-on-error not set." >&2
+        exit "${status}"
+      fi
+    fi
+
+    PIDS=("${PIDS[@]:1}")
+    PID_PAIRS=("${PID_PAIRS[@]:1}")
+  }
+
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    run_pairing "${player_alias}" "${opponent_alias}" &
+    PIDS+=("$!")
+    PID_PAIRS+=("${player_alias} vs ${opponent_alias}")
+    if [[ ${#PIDS[@]} -ge ${MAX_PARALLEL} ]]; then
+      wait_for_one
+    fi
+  done
+
+  while [[ ${#PIDS[@]} -gt 0 ]]; do
+    wait_for_one
+  done
+
+  if [[ ${FAILURE_COUNT} -gt 0 ]]; then
+    echo "==> ${FAILURE_COUNT} pairing(s) failed." >&2
+  fi
+else
+  for pair in "${PAIRS[@]}"; do
+    IFS="|" read -r player_alias opponent_alias <<<"${pair}"
+    run_pairing "${player_alias}" "${opponent_alias}"
+  done
+fi
+
+if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}" --viewer
+else
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}"
+fi
diff --git a/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh b/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh
new file mode 100755
index 00000000..a94eff86
--- /dev/null
+++ b/scripts/run_gpt54_vs_gpt53codex_high_remaining.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+REPO_ROOT="/Users/muhtasham/Documents/CodeClash"
+RUN_ROOT="${REPO_ROOT}/logs/gpt54_vs_gpt53codex_reasoning_20260308_164105"
+CFG_ROOT="${REPO_ROOT}/configs/generated/gpt-5.4-high_vs_gpt-5.3-codex-high"
+RUN_SUFFIX="gpt-5.4-high-vs-gpt-5.3-codex-high"
+
+cd "${REPO_ROOT}"
+
+# Force key source to repo .env so mini-swe-agent global env doesn't silently override.
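+# (dotenv_values reads the file directly and never mutates os.environ, so the
+# value extracted below is exactly what sits in .env, not an inherited export.)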
+OPENAI_KEY_FROM_REPO="$( + uv run python - "${REPO_ROOT}/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "${OPENAI_KEY_FROM_REPO}" ]]; then + echo "Error: OPENAI_API_KEY missing in ${REPO_ROOT}/.env" >&2 + exit 1 +fi + +export OPENAI_API_KEY="${OPENAI_KEY_FROM_REPO}" +unset OPENAI_KEY_FROM_REPO + +uv run python - "${REPO_ROOT}/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f"==> OPENAI key source forced: {repo_env}") +print(f"==> OPENAI key fingerprint: {fp(env_key)}") +if mini_key and mini_key != repo_key: + print("==> note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +mkdir -p "${RUN_ROOT}/quarantine" + +find "${RUN_ROOT}" -maxdepth 1 -type d \ + -name 'PvpTournament.Halite.r15.s250.p2.gpt-5.3-codex-high.gpt-5.4-high.gpt-5.4-high-vs-gpt-5.3-codex-high.*' \ + -exec mv {} "${RUN_ROOT}/quarantine"/ \; + +uv run python main.py \ + "${CFG_ROOT}/Halite__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/HuskyBench__gpt-5.4-high__gpt-5.3-codex-high__r15__s100.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/RoboCode__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" + +uv run python main.py \ + "${CFG_ROOT}/RobotRumble__gpt-5.4-high__gpt-5.3-codex-high__r15__s250.yaml" \ + -o "${RUN_ROOT}" \ + -s "${RUN_SUFFIX}" diff --git a/scripts/run_gpt54_vs_gpt53codex_reasoning.sh b/scripts/run_gpt54_vs_gpt53codex_reasoning.sh new file mode 100755 index 00000000..962eefeb --- /dev/null +++ b/scripts/run_gpt54_vs_gpt53codex_reasoning.sh @@ -0,0 +1,222 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +PLAYER_MODEL="openai/gpt-5.4" +OPPONENT_MODEL="openai/gpt-5.3-codex" +RUN_ROOT="${REPO_ROOT}/logs/gpt54_vs_gpt53codex_reasoning_$(date +%Y%m%d_%H%M%S)" +MAX_CONFIG_RETRIES=2 +CONTINUE_ON_ERROR=0 +OPEN_VIEWER=0 +RESUME=0 +DRY_RUN=0 +PARALLEL=0 +MAX_PARALLEL=4 + +usage() { + cat <<'EOF' +Usage: + scripts/run_gpt54_vs_gpt53codex_reasoning.sh [options] + +Description: + Runs direct head-to-head benchmark sweeps for openai/gpt-5.4 vs openai/gpt-5.3-codex + across four effort tiers in one shared log root: + - default + - low + - medium + - high + +Options: + --run-root Set custom logs root for this batch. + --max-config-retries Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue to remaining configs/tiers when one fails. + --resume Skip already-completed per-arena configs in an existing --run-root. + --parallel Run tiers in parallel. + --max-parallel Maximum concurrent tiers when --parallel is set (default: 4). + --viewer Launch viewer after the eval pipeline. 
+  --dry-run                  Print commands without running them.
+  -h, --help                 Show help.
+EOF
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --run-root)
+      RUN_ROOT="${2:-}"
+      shift 2
+      ;;
+    --max-config-retries)
+      MAX_CONFIG_RETRIES="${2:-}"
+      shift 2
+      ;;
+    --continue-on-error)
+      CONTINUE_ON_ERROR=1
+      shift
+      ;;
+    --resume)
+      RESUME=1
+      shift
+      ;;
+    --parallel)
+      PARALLEL=1
+      shift
+      ;;
+    --max-parallel)
+      MAX_PARALLEL="${2:-}"
+      shift 2
+      ;;
+    --viewer)
+      OPEN_VIEWER=1
+      shift
+      ;;
+    --dry-run)
+      DRY_RUN=1
+      shift
+      ;;
+    -h|--help)
+      usage
+      exit 0
+      ;;
+    *)
+      echo "Unknown argument: $1" >&2
+      usage
+      exit 1
+      ;;
+  esac
+done
+
+if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then
+  echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2
+  exit 1
+fi
+
+if ! [[ "${MAX_PARALLEL}" =~ ^[0-9]+$ ]] || [[ "${MAX_PARALLEL}" -lt 1 ]]; then
+  echo "Error: --max-parallel must be a positive integer, got '${MAX_PARALLEL}'" >&2
+  exit 1
+fi
+
+if [[ ! -x "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" ]]; then
+  echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" >&2
+  exit 1
+fi
+
+if [[ ! -x "${REPO_ROOT}/scripts/run_eval_pipeline.sh" ]]; then
+  echo "Missing or non-executable: ${REPO_ROOT}/scripts/run_eval_pipeline.sh" >&2
+  exit 1
+fi
+
+mkdir -p "${RUN_ROOT}"
+
+common_args=(
+  --model "${PLAYER_MODEL}"
+  --opponent "${OPPONENT_MODEL}"
+  --log-dir "${RUN_ROOT}"
+  --max-config-retries "${MAX_CONFIG_RETRIES}"
+)
+
+if [[ ${CONTINUE_ON_ERROR} -eq 1 ]]; then
+  common_args+=(--continue-on-error)
+fi
+
+if [[ ${RESUME} -eq 1 ]]; then
+  common_args+=(--resume)
+fi
+
+run_tier() {
+  local tier="$1"
+  local player_alias="gpt-5.4-${tier}"
+  local opponent_alias="gpt-5.3-codex-${tier}"
+  local -a args=("${common_args[@]}" --alias "${player_alias}" --opponent-alias "${opponent_alias}")
+
+  if [[ "${tier}" != "default" ]]; then
+    args+=(--player-reasoning-effort "${tier}" --opponent-reasoning-effort "${tier}")
+  fi
+
+  echo "==> Running tier: ${tier}"
+  "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh" "${args[@]}"
+}
+
+echo "==> Repo root: ${REPO_ROOT}"
+echo "==> Run root: ${RUN_ROOT}"
+echo "==> Player model: ${PLAYER_MODEL}"
+echo "==> Opponent model: ${OPPONENT_MODEL}"
+echo "==> Tiers: default, low, medium, high"
+echo "==> Continue on error: ${CONTINUE_ON_ERROR}"
+echo "==> Resume: ${RESUME}"
+echo "==> Parallel tiers: ${PARALLEL}"
+echo "==> Max parallel tiers: ${MAX_PARALLEL}"
+echo "==> Max config retries: ${MAX_CONFIG_RETRIES}"
+
+if [[ ${DRY_RUN} -eq 1 ]]; then
+  for tier in default low medium high; do
+    if [[ "${tier}" == "default" ]]; then
+      echo "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh ${common_args[*]} --alias gpt-5.4-default --opponent-alias gpt-5.3-codex-default"
+    else
+      echo "${REPO_ROOT}/scripts/run_openai_model_benchmarks.sh ${common_args[*]} --alias gpt-5.4-${tier} --opponent-alias gpt-5.3-codex-${tier} --player-reasoning-effort ${tier} --opponent-reasoning-effort ${tier}"
+    fi
+  done
+  if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT} --viewer"
+  else
+    echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${RUN_ROOT}"
+  fi
+  exit 0
+fi
+
+if [[ ${PARALLEL} -eq 1 ]]; then
+  declare -a PIDS=()
+  declare -a PID_TIERS=()
+  declare -i FAILURE_COUNT=0
+
+  wait_for_one() {
+    local pid="${PIDS[0]}"
+    local tier="${PID_TIERS[0]}"
+    local status=0
+
+    # Capture the real exit status: with `if ! wait ...`, $? reflects the
+    # negation (always 0), so a failed tier would previously exit 0.
+    wait "${pid}" || status=$?
+    if [[ ${status} -ne 0 ]]; then
+      echo "==> Tier failed: ${tier} (exit ${status})" >&2
+      FAILURE_COUNT+=1
+      if [[ ${CONTINUE_ON_ERROR} -ne 1 ]]; then
+        echo "==> Stopping due to tier failure and --continue-on-error not set." >&2
+        exit "${status}"
+      fi
+    else
+      echo "==> Tier finished: ${tier}"
+    fi
+
+    PIDS=("${PIDS[@]:1}")
+    PID_TIERS=("${PID_TIERS[@]:1}")
+  }
+
+  for tier in default low medium high; do
+    run_tier "${tier}" &
+    PIDS+=("$!")
+    PID_TIERS+=("${tier}")
+    if [[ ${#PIDS[@]} -ge ${MAX_PARALLEL} ]]; then
+      wait_for_one
+    fi
+  done
+
+  while [[ ${#PIDS[@]} -gt 0 ]]; do
+    wait_for_one
+  done
+
+  if [[ ${FAILURE_COUNT} -gt 0 ]]; then
+    echo "==> ${FAILURE_COUNT} tier(s) failed." >&2
+  fi
+else
+  run_tier default
+  run_tier low
+  run_tier medium
+  run_tier high
+fi
+
+if [[ ${OPEN_VIEWER} -eq 1 ]]; then
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}" --viewer
+else
+  "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${RUN_ROOT}"
+fi
diff --git a/scripts/run_openai_model_benchmarks.sh b/scripts/run_openai_model_benchmarks.sh
new file mode 100755
index 00000000..d258e655
--- /dev/null
+++ b/scripts/run_openai_model_benchmarks.sh
@@ -0,0 +1,498 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
+
+usage() {
+  cat <<'EOF'
+Usage:
+  scripts/run_openai_model_benchmarks.sh --model <model-id> [options]
+
+Description:
+  Generates configs for one OpenAI model vs a configurable opponent across the standard
+  benchmark arenas, runs all tournaments, and optionally runs post-eval analysis.
+
+Required:
+  --model <model-id>         Example: openai/gpt-5.4-pro-2026-03-05
+
+Optional:
+  --alias <name>             Player/config alias (default: model basename).
+  --opponent <model-id>      Opponent model id (default: openai/gpt-5).
+  --opponent-alias <name>    Opponent display alias in config (default: opponent basename).
+  --reasoning-effort <level> Set reasoning_effort for both players (e.g. low|medium|high).
+  --player-reasoning-effort <level>
+                             Set reasoning_effort only for the evaluated model.
+  --opponent-reasoning-effort <level>
+                             Set reasoning_effort only for the opponent model.
+  --log-dir <dir>            Logs root output dir (default: logs/<alias>_vs_<opponent-alias>_<timestamp>).
+  --configs-dir <dir>        Generated configs dir (default: configs/generated).
+  --resume                   Skip configs that already have a completed run in --log-dir.
+  --max-config-retries <n>   Retry each failed config up to n times (default: 2).
+  --continue-on-error        Continue to remaining configs even if one fails.
+  --post-eval                Run scripts/run_eval_pipeline.sh after runs.
+  --viewer                   With --post-eval, also launch viewer at end.
+  --dry-run                  Generate configs + print run commands only.
+  -h, --help                 Show help.
+
+Notes:
+  - This script uses model_class: litellm for both players.
+  - It expects OPENAI_API_KEY (and usually GITHUB_TOKEN) in your environment.
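+  - Generated configs are written to <configs-dir>/<alias>_vs_<opponent-alias>/
+    and regenerated from the configs/main templates on every invocation.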
+EOF +} + +MODEL="" +ALIAS="" +OPPONENT="openai/gpt-5" +OPPONENT_ALIAS="" +REASONING_EFFORT="" +PLAYER_REASONING_EFFORT="" +OPPONENT_REASONING_EFFORT="" +LOG_DIR="" +CONFIGS_DIR="${REPO_ROOT}/configs/generated" +RUN_POST_EVAL=0 +OPEN_VIEWER=0 +DRY_RUN=0 +RESUME=0 +CONTINUE_ON_ERROR=0 +MAX_CONFIG_RETRIES=2 + +while [[ $# -gt 0 ]]; do + case "$1" in + --model) + MODEL="${2:-}" + shift 2 + ;; + --alias) + ALIAS="${2:-}" + shift 2 + ;; + --opponent) + OPPONENT="${2:-}" + shift 2 + ;; + --opponent-alias) + OPPONENT_ALIAS="${2:-}" + shift 2 + ;; + --reasoning-effort) + REASONING_EFFORT="${2:-}" + shift 2 + ;; + --player-reasoning-effort) + PLAYER_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --opponent-reasoning-effort) + OPPONENT_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --log-dir) + LOG_DIR="${2:-}" + shift 2 + ;; + --configs-dir) + CONFIGS_DIR="${2:-}" + shift 2 + ;; + --post-eval) + RUN_POST_EVAL=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + --dry-run) + DRY_RUN=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${MODEL}" ]]; then + echo "Error: --model is required." >&2 + usage + exit 1 +fi + +if ! [[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +# Force key source to repo .env so mini-swe-agent global env doesn't silently override. +OPENAI_KEY_FROM_REPO="$( + uv run python - "${REPO_ROOT}/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "${OPENAI_KEY_FROM_REPO}" ]]; then + echo "Error: OPENAI_API_KEY missing in ${REPO_ROOT}/.env" >&2 + exit 1 +fi +export OPENAI_API_KEY="${OPENAI_KEY_FROM_REPO}" +unset OPENAI_KEY_FROM_REPO + +uv run python - "${REPO_ROOT}/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f"==> OPENAI key source forced: {repo_env}") +print(f"==> OPENAI key fingerprint: {fp(env_key)}") +if mini_key and mini_key != repo_key: + print("==> note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +normalize_model_id() { + local v="${1#@}" + if [[ "${v}" == */* ]]; then + echo "${v}" + else + echo "openai/${v}" + fi +} + +# Accept "@openai/gpt-5", "openai/gpt-5", and bare "gpt-5". 
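+# normalize_model_id (defined above) strips a leading "@" and prefixes bare
+# names with "openai/", so all three spellings resolve to the same LiteLLM id.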
+MODEL="$(normalize_model_id "${MODEL}")" +OPPONENT="$(normalize_model_id "${OPPONENT}")" + +if [[ -n "${REASONING_EFFORT}" ]]; then + if [[ -z "${PLAYER_REASONING_EFFORT}" ]]; then + PLAYER_REASONING_EFFORT="${REASONING_EFFORT}" + fi + if [[ -z "${OPPONENT_REASONING_EFFORT}" ]]; then + OPPONENT_REASONING_EFFORT="${REASONING_EFFORT}" + fi +fi + +requires_responses_model_class() { + local m="$1" + case "$m" in + openai/gpt-5.4-pro-2026-03-05) return 0 ;; + *) return 1 ;; + esac +} + +PLAYER_MODEL_CLASS="litellm" +OPPONENT_MODEL_CLASS="litellm" +RESPONSES_SANITIZED_CLASS="codeclash.agents.litellm_response_sanitized_model.LitellmResponseSanitizedModel" +responses_class_available=0 +if uv run python - "${RESPONSES_SANITIZED_CLASS}" <<'PY' >/dev/null 2>&1 +import importlib +import sys + +class_path = sys.argv[1] +module_name, class_name = class_path.rsplit(".", 1) +mod = importlib.import_module(module_name) +getattr(mod, class_name) +PY +then + responses_class_available=1 +fi +if requires_responses_model_class "${MODEL}"; then + if [[ ${responses_class_available} -eq 1 ]]; then + PLAYER_MODEL_CLASS="${RESPONSES_SANITIZED_CLASS}" + else + echo "Warning: requested Responses model class '${RESPONSES_SANITIZED_CLASS}' is not importable; falling back to 'litellm'." >&2 + fi +fi +if requires_responses_model_class "${OPPONENT}"; then + if [[ ${responses_class_available} -eq 1 ]]; then + OPPONENT_MODEL_CLASS="${RESPONSES_SANITIZED_CLASS}" + else + echo "Warning: requested Responses model class '${RESPONSES_SANITIZED_CLASS}' is not importable; falling back to 'litellm'." >&2 + fi +fi + +if [[ -z "${ALIAS}" ]]; then + ALIAS="${MODEL#openai/}" +fi + +if [[ -z "${OPPONENT_ALIAS}" ]]; then + OPPONENT_ALIAS="${OPPONENT#openai/}" +fi + +SAFE_ALIAS="${ALIAS//\//-}" +SAFE_ALIAS="${SAFE_ALIAS//@/}" +SAFE_OPPONENT_ALIAS="${OPPONENT_ALIAS//\//-}" +SAFE_OPPONENT_ALIAS="${SAFE_OPPONENT_ALIAS//@/}" +RUN_SUFFIX="${SAFE_ALIAS}-vs-${SAFE_OPPONENT_ALIAS}" + +if [[ -z "${LOG_DIR}" ]]; then + TS="$(date +%Y%m%d_%H%M%S)" + LOG_DIR="${REPO_ROOT}/logs/${SAFE_ALIAS}_vs_${SAFE_OPPONENT_ALIAS}_${TS}" +fi + +RUN_CONFIG_DIR="${CONFIGS_DIR%/}/${SAFE_ALIAS}_vs_${SAFE_OPPONENT_ALIAS}" +mkdir -p "${RUN_CONFIG_DIR}" "${LOG_DIR}" + +declare -a TEMPLATES=( + "${REPO_ROOT}/configs/main/BattleSnake__gpt-5__o3__r15__s1000.yaml" + "${REPO_ROOT}/configs/main/CoreWar__gpt-5__o3__r15__s1000.yaml" + "${REPO_ROOT}/configs/main/Halite__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/RoboCode__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/RobotRumble__gpt-5__o3__r15__s250.yaml" + "${REPO_ROOT}/configs/main/HuskyBench__gpt-5__o3__r15__s100.yaml" +) + +for tpl in "${TEMPLATES[@]}"; do + if [[ ! 
-f "${tpl}" ]]; then + echo "Error: Missing template config: ${tpl}" >&2 + exit 1 + fi +done + +declare -a GENERATED_CONFIGS=() +for tpl in "${TEMPLATES[@]}"; do + base_name="$(basename "${tpl}")" + out_name="${base_name/__gpt-5__o3__/__${SAFE_ALIAS}__${SAFE_OPPONENT_ALIAS}__}" + out_path="${RUN_CONFIG_DIR}/${out_name}" + + uv run python - "${tpl}" "${out_path}" "${MODEL}" "${ALIAS}" "${OPPONENT}" "${OPPONENT_ALIAS}" "${PLAYER_MODEL_CLASS}" "${OPPONENT_MODEL_CLASS}" "${PLAYER_REASONING_EFFORT}" "${OPPONENT_REASONING_EFFORT}" <<'PY' +from pathlib import Path +import re +import sys + +src = Path(sys.argv[1]) +dst = Path(sys.argv[2]) +model = sys.argv[3] +alias = sys.argv[4] +opponent = sys.argv[5] +opponent_alias = sys.argv[6] +player_model_class = sys.argv[7] +opponent_model_class = sys.argv[8] +player_reasoning_effort = sys.argv[9] +opponent_reasoning_effort = sys.argv[10] + +text = src.read_text() + +# First player name in template is gpt-5. +text = re.sub(r"(?m)^ name: gpt-5$", f" name: {alias}", text, count=1) +# Second player name in template is o3. +text = re.sub(r"(?m)^ name: o3$", f" name: {opponent_alias}", text, count=1) + +# Convert model IDs from Portkey-style "@openai/*" to LiteLLM "openai/*". +text = text.replace("model_name: '@openai/gpt-5'", f"model_name: '{model}'") +text = text.replace("model_name: '@openai/o3'", f"model_name: '{opponent}'") + +# Base class for generated configs. +text = text.replace("model_class: portkey", "model_class: litellm") + +# The template has exactly two player blocks; map class by player order. +class_lines = [i for i, line in enumerate(text.splitlines()) if line.strip() == "model_class: litellm"] +lines = text.splitlines() +if len(class_lines) >= 1 and player_model_class != "litellm": + lines[class_lines[0]] = re.sub(r"litellm$", player_model_class, lines[class_lines[0]]) +if len(class_lines) >= 2 and opponent_model_class != "litellm": + lines[class_lines[1]] = re.sub(r"litellm$", opponent_model_class, lines[class_lines[1]]) + +# Inject per-player reasoning effort as model_kwargs when requested. +offset = 0 +targets = [ + (class_lines[0] if len(class_lines) >= 1 else None, player_reasoning_effort), + (class_lines[1] if len(class_lines) >= 2 else None, opponent_reasoning_effort), +] +for class_idx, effort in targets: + if class_idx is None or not effort: + continue + idx = class_idx + offset + indent = re.match(r"^(\s*)", lines[idx]).group(1) + effort_escaped = effort.replace("'", "''") + lines[idx + 1 : idx + 1] = [ + f"{indent}model_kwargs:", + f"{indent} reasoning_effort: '{effort_escaped}'", + ] + offset += 2 + +text = "\n".join(lines) + ("\n" if text.endswith("\n") else "") + +dst.write_text(text) +PY + + GENERATED_CONFIGS+=("${out_path}") +done + +echo "==> Model: ${MODEL}" +echo "==> Alias: ${ALIAS}" +echo "==> Opponent: ${OPPONENT}" +echo "==> Opponent alias: ${OPPONENT_ALIAS}" +echo "==> Player model class: ${PLAYER_MODEL_CLASS}" +echo "==> Opponent model class: ${OPPONENT_MODEL_CLASS}" +echo "==> Player reasoning_effort: ${PLAYER_REASONING_EFFORT:-}" +echo "==> Opponent reasoning_effort: ${OPPONENT_REASONING_EFFORT:-}" +echo "==> Generated configs dir: ${RUN_CONFIG_DIR}" +echo "==> Logs dir: ${LOG_DIR}" +echo "==> Resume: ${RESUME}" +echo "==> Continue on error: ${CONTINUE_ON_ERROR}" +echo "==> Max config retries: ${MAX_CONFIG_RETRIES}" +echo "==> Configs:" +printf ' - %s\n' "${GENERATED_CONFIGS[@]}" + +if [[ ${DRY_RUN} -eq 1 ]]; then + echo + echo "Dry-run only. 
Commands that would run:" + for cfg in "${GENERATED_CONFIGS[@]}"; do + echo "uv run python ${REPO_ROOT}/main.py ${cfg} -o ${LOG_DIR} -s ${RUN_SUFFIX}" + done + if [[ ${RUN_POST_EVAL} -eq 1 ]]; then + if [[ ${OPEN_VIEWER} -eq 1 ]]; then + echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${LOG_DIR} --viewer" + else + echo "${REPO_ROOT}/scripts/run_eval_pipeline.sh --log-dir ${LOG_DIR}" + fi + fi + exit 0 +fi + +cd "${REPO_ROOT}" + +is_config_completed() { + local cfg_path="$1" + local log_dir="$2" + uv run python - "$cfg_path" "$log_dir" <<'PY' +import json +import sys +from pathlib import Path + +import yaml + +cfg_path = Path(sys.argv[1]) +log_dir = Path(sys.argv[2]) + +cfg = yaml.safe_load(cfg_path.read_text()) +cfg_game = cfg["game"]["name"] +cfg_rounds = int(cfg["tournament"]["rounds"]) +cfg_players = sorted(p["name"] for p in cfg["players"]) +expected_round_keys = {str(i) for i in range(cfg_rounds + 1)} + +for meta_path in log_dir.rglob("metadata.json"): + try: + meta = json.loads(meta_path.read_text()) + except Exception: + continue + conf = meta.get("config", {}) + game = conf.get("game", {}).get("name") + players = sorted(p.get("name") for p in conf.get("players", [])) + if game != cfg_game or players != cfg_players: + continue + round_stats = meta.get("round_stats", {}) + if expected_round_keys.issubset(set(round_stats.keys())): + print(meta_path) + sys.exit(0) + +sys.exit(1) +PY +} + +declare -a COMPLETED_CONFIGS=() +declare -a SKIPPED_CONFIGS=() +declare -a FAILED_CONFIGS=() + +for cfg in "${GENERATED_CONFIGS[@]}"; do + echo + echo "==> Running benchmark: ${cfg}" + + if [[ ${RESUME} -eq 1 ]]; then + if completed_path="$(is_config_completed "${cfg}" "${LOG_DIR}" 2>/dev/null)"; then + echo " skipping (resume): found completed run at ${completed_path}" + SKIPPED_CONFIGS+=("${cfg}") + continue + fi + fi + + max_attempts=$((MAX_CONFIG_RETRIES + 1)) + attempt=1 + ran_ok=0 + while [[ ${attempt} -le ${max_attempts} ]]; do + echo " attempt ${attempt}/${max_attempts}" + if uv run python "${REPO_ROOT}/main.py" "${cfg}" -o "${LOG_DIR}" -s "${RUN_SUFFIX}"; then + ran_ok=1 + break + fi + if [[ ${attempt} -lt ${max_attempts} ]]; then + sleep_s=$((15 * (2 ** (attempt - 1)))) + echo " failed attempt ${attempt}; retrying in ${sleep_s}s..." + sleep "${sleep_s}" + fi + attempt=$((attempt + 1)) + done + + if [[ ${ran_ok} -eq 1 ]]; then + COMPLETED_CONFIGS+=("${cfg}") + continue + fi + + FAILED_CONFIGS+=("${cfg}") + if [[ ${CONTINUE_ON_ERROR} -eq 0 ]]; then + echo "Error: benchmark failed and --continue-on-error is not set." >&2 + exit 1 + fi +done + +echo +echo "==> Run summary" +echo "Completed: ${#COMPLETED_CONFIGS[@]}" +echo "Skipped (resume): ${#SKIPPED_CONFIGS[@]}" +echo "Failed: ${#FAILED_CONFIGS[@]}" +if [[ ${#FAILED_CONFIGS[@]} -gt 0 ]]; then + printf ' - %s\n' "${FAILED_CONFIGS[@]}" +fi + +if [[ ${RUN_POST_EVAL} -eq 1 ]]; then + echo + echo "==> Running post-eval pipeline..." + if [[ ${OPEN_VIEWER} -eq 1 ]]; then + "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${LOG_DIR}" --viewer + else + "${REPO_ROOT}/scripts/run_eval_pipeline.sh" --log-dir "${LOG_DIR}" + fi +fi + +echo +echo "Done." 
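+# If --post-eval was not requested, the same analysis can be run later with:
+#   scripts/run_eval_pipeline.sh --log-dir <the logs dir printed below>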
+echo "Logs: ${LOG_DIR}" +echo "Generated configs: ${RUN_CONFIG_DIR}" diff --git a/scripts/run_openai_sweep.sh b/scripts/run_openai_sweep.sh new file mode 100755 index 00000000..16cb59cd --- /dev/null +++ b/scripts/run_openai_sweep.sh @@ -0,0 +1,332 @@ +#!/usr/bin/env bash + +set -euo pipefail + +REPO="/Users/muhtasham/Documents/CodeClash" +MODELS=( + "openai/gpt-5.4" + "openai/gpt-5.3-codex" +) +OPPONENT="openai/gpt-5" + +RUN_ROOT="$REPO/logs/new_openai_sweep_$(date +%Y%m%d_%H%M%S)" +CHECK_ONLY=0 +PUSH_DIFFS=0 +OPEN_VIEWER=0 +RESUME=0 +CONTINUE_ON_ERROR=0 +MAX_CONFIG_RETRIES=2 +REASONING_EFFORT="" +PLAYER_REASONING_EFFORT="" +OPPONENT_REASONING_EFFORT="" + +usage() { + cat <<'EOF' +Usage: + scripts/run_openai_sweep.sh [options] + +Options: + --run-root Set custom logs root for this sweep. + --opponent Opponent baseline model (default: openai/gpt-5). + --reasoning-effort + Set reasoning_effort for both players (e.g. low|medium|high). + --player-reasoning-effort + Set reasoning_effort for evaluated models only. + --opponent-reasoning-effort + Set reasoning_effort for opponent only. + --check-only Run preflight checks + dry runs only, then exit. + --resume Skip already-completed per-arena configs in an existing --run-root. + --max-config-retries + Retry each failed arena config up to n times (default: 2). + --continue-on-error Continue with other configs/models when a config fails. + --push-diffs After eval, push per-tournament code diffs to arena repos. + --viewer Launch local viewer at end of pipeline. + -h, --help Show help. + +What this script does: + 1) Preflight checks: + - local scripts/dependencies + - LiteLLM support for: + openai/gpt-5.4 + openai/gpt-5.3-codex + and opponent baseline model + - dry-run config generation for each model + 2) Full benchmark runs for all listed models (all standard arenas vs chosen opponent) + 3) Combined post-eval pipeline over one shared run root + 4) Optional diff-branch push to CodeClash arena repos +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --opponent) + OPPONENT="${2:-}" + shift 2 + ;; + --reasoning-effort) + REASONING_EFFORT="${2:-}" + shift 2 + ;; + --player-reasoning-effort) + PLAYER_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --opponent-reasoning-effort) + OPPONENT_REASONING_EFFORT="${2:-}" + shift 2 + ;; + --check-only) + CHECK_ONLY=1 + shift + ;; + --resume) + RESUME=1 + shift + ;; + --max-config-retries) + MAX_CONFIG_RETRIES="${2:-}" + shift 2 + ;; + --continue-on-error) + CONTINUE_ON_ERROR=1 + shift + ;; + --push-diffs) + PUSH_DIFFS=1 + shift + ;; + --viewer) + OPEN_VIEWER=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if ! 
[[ "${MAX_CONFIG_RETRIES}" =~ ^[0-9]+$ ]]; then + echo "Error: --max-config-retries must be a non-negative integer, got '${MAX_CONFIG_RETRIES}'" >&2 + exit 1 +fi + +normalize_model_id() { + local v="${1#@}" + if [[ "${v}" == */* ]]; then + echo "${v}" + else + echo "openai/${v}" + fi +} + +OPPONENT="$(normalize_model_id "${OPPONENT}")" + +if [[ -n "${REASONING_EFFORT}" ]]; then + if [[ -z "${PLAYER_REASONING_EFFORT}" ]]; then + PLAYER_REASONING_EFFORT="${REASONING_EFFORT}" + fi + if [[ -z "${OPPONENT_REASONING_EFFORT}" ]]; then + OPPONENT_REASONING_EFFORT="${REASONING_EFFORT}" + fi +fi + +COMMON_BENCH_ARGS=(--opponent "$OPPONENT" --max-config-retries "$MAX_CONFIG_RETRIES") +if [[ "$RESUME" -eq 1 ]]; then + COMMON_BENCH_ARGS+=(--resume) +fi +if [[ "$CONTINUE_ON_ERROR" -eq 1 ]]; then + COMMON_BENCH_ARGS+=(--continue-on-error) +fi +if [[ -n "${PLAYER_REASONING_EFFORT}" ]]; then + COMMON_BENCH_ARGS+=(--player-reasoning-effort "$PLAYER_REASONING_EFFORT") +fi +if [[ -n "${OPPONENT_REASONING_EFFORT}" ]]; then + COMMON_BENCH_ARGS+=(--opponent-reasoning-effort "$OPPONENT_REASONING_EFFORT") +fi + +if [[ ! -d "$REPO" ]]; then + echo "Repo not found: $REPO" >&2 + exit 1 +fi + +if [[ ! -x "$REPO/scripts/run_openai_model_benchmarks.sh" ]]; then + echo "Missing or non-executable: $REPO/scripts/run_openai_model_benchmarks.sh" >&2 + exit 1 +fi + +if [[ ! -x "$REPO/scripts/run_eval_pipeline.sh" ]]; then + echo "Missing or non-executable: $REPO/scripts/run_eval_pipeline.sh" >&2 + exit 1 +fi + +if [[ ! -f "$REPO/scripts/push_log_to_gh.py" ]]; then + echo "Missing: $REPO/scripts/push_log_to_gh.py" >&2 + exit 1 +fi + +mkdir -p "$RUN_ROOT" + +cd "$REPO" + +echo "==> Auth preflight: forcing OPENAI key from repo .env..." +OPENAI_KEY_FROM_REPO="$( + uv run python - "$REPO/.env" <<'PY' +import sys +from pathlib import Path +from dotenv import dotenv_values + +env_path = Path(sys.argv[1]) +if not env_path.exists(): + raise SystemExit(2) +val = dotenv_values(env_path).get("OPENAI_API_KEY", "") +print(val or "") +PY +)" + +if [[ -z "$OPENAI_KEY_FROM_REPO" ]]; then + echo "Error: OPENAI_API_KEY missing in $REPO/.env" >&2 + exit 1 +fi +export OPENAI_API_KEY="$OPENAI_KEY_FROM_REPO" +unset OPENAI_KEY_FROM_REPO + +uv run python - "$REPO/.env" <<'PY' +import hashlib +import os +import sys +from pathlib import Path +from dotenv import dotenv_values + +def fp(v: str) -> str: + return f"len={len(v)} sha256[:10]={hashlib.sha256(v.encode()).hexdigest()[:10]} tail={v[-4:]}" + +repo_env = Path(sys.argv[1]) +repo_key = dotenv_values(repo_env).get("OPENAI_API_KEY", "") +env_key = os.environ.get("OPENAI_API_KEY", "") +mini_env = Path.home() / "Library/Application Support/mini-swe-agent/.env" +mini_key = dotenv_values(mini_env).get("OPENAI_API_KEY", "") if mini_env.exists() else "" + +print(f" source: {repo_env}") +print(f" active OPENAI_API_KEY: {fp(env_key)}") +print(f" repo OPENAI_API_KEY: {fp(repo_key)}") +if mini_key: + print(f" mini OPENAI_API_KEY: {fp(mini_key)}") + if mini_key != repo_key: + print(" note: mini-swe-agent global key differs; repo key is forced for this run.") +PY + +echo "==> Sweep run root: $RUN_ROOT" +echo "==> Models:" +printf ' - %s\n' "${MODELS[@]}" +echo "==> Opponent baseline: $OPPONENT" +echo "==> Player reasoning_effort: ${PLAYER_REASONING_EFFORT:-}" +echo "==> Opponent reasoning_effort: ${OPPONENT_REASONING_EFFORT:-}" + +echo +echo "==> Preflight 1/3: LiteLLM model support checks..." 
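+# Fails fast, before any tournament spend, if the installed litellm cannot
+# route the swept models or the opponent (provider lookup + model_info).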
+uv run python - "$OPPONENT" <<'PY' +import sys +from importlib.metadata import version +import litellm + +opponent = sys.argv[1] +models = [ + "openai/gpt-5.4", + "openai/gpt-5.3-codex", + opponent, +] + +print(f"litellm_version={version('litellm')}") +ok = True +for m in models: + print(f"\nMODEL {m}") + try: + print(" provider:", litellm.get_llm_provider(model=m)) + except Exception as e: + ok = False + print(f" provider_error: {type(e).__name__}: {e}") + try: + info = litellm.get_model_info(model=m) + print( + " model_info:", + { + "max_input_tokens": info.get("max_input_tokens"), + "max_output_tokens": info.get("max_output_tokens"), + "supports_function_calling": info.get("supports_function_calling"), + }, + ) + except Exception as e: + ok = False + print(f" model_info_error: {type(e).__name__}: {e}") + +if not ok: + sys.exit(1) +PY + +echo +echo "==> Preflight 2/3: Dry-run config generation checks..." +for MODEL in "${MODELS[@]}"; do + echo " -> $MODEL" + "$REPO/scripts/run_openai_model_benchmarks.sh" \ + --model "$MODEL" \ + --log-dir "$RUN_ROOT" \ + "${COMMON_BENCH_ARGS[@]}" \ + --dry-run >/dev/null +done + +echo +echo "==> Preflight 3/3: GitHub CLI auth check (needed for optional upload/push flows)..." +gh auth status >/dev/null +echo " gh auth: OK" + +if [[ "$CHECK_ONLY" -eq 1 ]]; then + echo + echo "Preflight + dry-run checks passed. Exiting due to --check-only." + exit 0 +fi + +echo +echo "==> Running full sweeps..." +for MODEL in "${MODELS[@]}"; do + echo + echo "### Running model: $MODEL" + "$REPO/scripts/run_openai_model_benchmarks.sh" \ + --model "$MODEL" \ + --log-dir "$RUN_ROOT" \ + "${COMMON_BENCH_ARGS[@]}" +done + +echo +echo "==> Running combined post-eval pipeline..." +if [[ "$OPEN_VIEWER" -eq 1 ]]; then + "$REPO/scripts/run_eval_pipeline.sh" --log-dir "$RUN_ROOT" --viewer +else + "$REPO/scripts/run_eval_pipeline.sh" --log-dir "$RUN_ROOT" +fi + +if [[ "$PUSH_DIFFS" -eq 1 ]]; then + echo + echo "==> Pushing per-tournament diffs to arena repos..." + find "$RUN_ROOT" -type f -name metadata.json -print0 \ + | xargs -0 -I{} dirname "{}" \ + | sort -u \ + | while read -r folder; do + echo " -> $folder" + uv run python "$REPO/scripts/push_log_to_gh.py" "$folder" + done +fi + +echo +echo "Done." +echo "Run root: $RUN_ROOT" +echo "Combined leaderboard JSON: $RUN_ROOT/analysis/elo/leaderboards.json" diff --git a/scripts/scrape_viewer_leaderboard_runs.py b/scripts/scrape_viewer_leaderboard_runs.py new file mode 100644 index 00000000..7db572a7 --- /dev/null +++ b/scripts/scrape_viewer_leaderboard_runs.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +"""Scrape viewer.codeclash.ai completed runs and download metadata.json files. + +By default this targets the 8-model public leaderboard cohort and the 6 arenas: +BattleSnake, CoreWar, Halite, RobotRumble, RoboCode, HuskyBench. 
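+
+Typical invocation (flags as defined in main() below; the output directory is
+an arbitrary local path):
+
+    uv run python scripts/scrape_viewer_leaderboard_runs.py \
+        --output-root viewer_runs --strategy latest-per-pair-game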
+""" + +from __future__ import annotations + +import argparse +import json +import re +import urllib.parse +import urllib.request +from dataclasses import dataclass +from pathlib import Path + + +DEFAULT_MODELS = [ + "claude-sonnet-4-5-20250929", + "gpt-5", + "o3", + "claude-sonnet-4-20250514", + "gpt-5-mini", + "gemini-2.5-pro", + "grok-code-fast-1", + "qwen3-coder-plus-2025-09-23", +] + +DEFAULT_GAMES = ["BattleSnake", "CoreWar", "Halite", "RobotRumble", "RoboCode", "HuskyBench"] + +VIEWER_BASE = "https://viewer.codeclash.ai" + + +@dataclass(frozen=True) +class RunRef: + rel_path: str + game: str + rounds: int + sims: int + players: int + p1: str + p2: str + ts: str + + +def fetch_index_html() -> str: + with urllib.request.urlopen(VIEWER_BASE + "/", timeout=60) as r: + return r.read().decode("utf-8", errors="replace") + + +def extract_paths(html: str) -> list[str]: + # Example: data-path="completed/PvpTournament.BattleSnake.r15.s1000.p2.a.b.251002061714" + return sorted(set(re.findall(r'data-path="(completed/PvpTournament\.[^"]+)"', html))) + + +def _alias_variants(alias: str) -> set[str]: + norm = re.sub(r"[^a-zA-Z0-9]", "", alias).lower() + return {alias, norm} + + +def _build_variant_lookup(models: set[str]) -> dict[str, str]: + out: dict[str, str] = {} + for m in models: + for v in _alias_variants(m): + out[v] = m + return out + + +def parse_run(path: str, models: set[str], variant_to_alias: dict[str, str]) -> RunRef | None: + # Parse fixed front/back first, then decode the middle p1.p2 region safely. + if not path.startswith("completed/PvpTournament."): + return None + # Strip prefix "completed/PvpTournament." + body = path[len("completed/PvpTournament.") :] + # Find timestamp token (12 digits), allowing optional suffix after it + # (e.g. ".-uuid" in some games). + try: + pre = body + parts = pre.split(".") + ts_idx = None + for i in range(len(parts) - 1, -1, -1): + if re.fullmatch(r"\d{12}", parts[i]): + ts_idx = i + break + if ts_idx is None: + return None + ts = parts[ts_idx] + pre = ".".join(parts[:ts_idx]) + except ValueError: + return None + parts = pre.split(".") + if len(parts) < 5: + return None + game = parts[0] + rounds_s = parts[1] + sims_s = parts[2] + players_s = parts[3] + model_region = ".".join(parts[4:]) + + if not rounds_s.startswith("r") or not sims_s.startswith("s") or not players_s.startswith("p"): + return None + + rounds = int(rounds_s[1:]) + sims = int(sims_s[1:]) + players = int(players_s[1:]) + + # Identify p1/p2 using known model aliases (including normalized + # hyphenless variants used in some logs). + p1 = p2 = None + variants = sorted(variant_to_alias.keys(), key=len, reverse=True) + for alias_variant in variants: + pref = alias_variant + "." + if model_region.startswith(pref): + tail = model_region[len(pref) :] + if tail in variant_to_alias: + p1 = variant_to_alias[alias_variant] + p2 = variant_to_alias[tail] + break + if p1 is None or p2 is None: + return None + + return RunRef( + rel_path=path, + game=game, + rounds=rounds, + sims=sims, + players=players, + p1=p1, + p2=p2, + ts=ts, + ) + + +def build_download_url(rel_path: str) -> str: + # Endpoint expects absolute path on the viewer host. 
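+    # The /home/klieret/CodeClash/logs prefix below is where the viewer host
+    # keeps its runs; it is hardcoded here and must track server-side moves.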
+ abs_path = f"/home/klieret/CodeClash/logs/{rel_path}/metadata.json" + q = urllib.parse.urlencode({"path": abs_path}) + return f"{VIEWER_BASE}/download-file/?{q}" + + +def build_game_page_url(rel_path: str) -> str: + return f"{VIEWER_BASE}/game/{rel_path}.html" + + +def _extract_json_object_after_marker(text: str, marker: str) -> str | None: + idx = text.find(marker) + if idx < 0: + return None + i = idx + len(marker) + while i < len(text) and text[i].isspace(): + i += 1 + if i >= len(text) or text[i] != "{": + return None + + # Brace-match while respecting quoted strings. + depth = 0 + in_str = False + escaped = False + start = i + for j in range(i, len(text)): + ch = text[j] + if in_str: + if escaped: + escaped = False + elif ch == "\\": + escaped = True + elif ch == '"': + in_str = False + continue + if ch == '"': + in_str = True + continue + if ch == "{": + depth += 1 + elif ch == "}": + depth -= 1 + if depth == 0: + return text[start : j + 1] + return None + + +def _extract_embedded_metadata_from_game_html(html: str) -> dict | None: + blob = _extract_json_object_after_marker(html, "initializeJSONEditors(") + if not blob: + return None + return json.loads(blob) + + +def download(url: str, out_file: Path) -> bool: + out_file.parent.mkdir(parents=True, exist_ok=True) + try: + with urllib.request.urlopen(url, timeout=60) as r: + data = r.read() + out_file.write_bytes(data) + # basic sanity + json.loads(out_file.read_text()) + return True + except Exception: + return False + + +def download_metadata_via_game_page(rel_path: str, out_file: Path) -> bool: + out_file.parent.mkdir(parents=True, exist_ok=True) + page_url = build_game_page_url(rel_path) + try: + with urllib.request.urlopen(page_url, timeout=60) as r: + html = r.read().decode("utf-8", errors="replace") + payload = _extract_embedded_metadata_from_game_html(html) + if payload is None: + return False + # Viewer pages embed a wrapper object used by front-end widgets. The + # actual tournament metadata is under "results". 
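+        # Fall back to the whole payload when "results" is absent so older
+        # page formats still yield a usable metadata.json.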
+ metadata = payload.get("results") if isinstance(payload, dict) else None + if not isinstance(metadata, dict): + metadata = payload + out_file.write_text(json.dumps(metadata, indent=2)) + # basic sanity + json.loads(out_file.read_text()) + return True + except Exception: + return False + + +def main() -> int: + ap = argparse.ArgumentParser() + ap.add_argument( + "--output-root", + type=Path, + required=True, + help="Local root to save downloaded runs (folders with metadata.json).", + ) + ap.add_argument("--models", nargs="*", default=DEFAULT_MODELS, help="Model aliases to include.") + ap.add_argument("--games", nargs="*", default=DEFAULT_GAMES, help="Game names to include.") + ap.add_argument("--rounds", type=int, default=15) + ap.add_argument("--players", type=int, default=2) + ap.add_argument( + "--strategy", + choices=["latest-per-pair-game", "all-matching"], + default="latest-per-pair-game", + help="Download only latest run per (game,unordered pair), or all matching runs.", + ) + args = ap.parse_args() + + models = set(args.models) + games = set(args.games) + variant_to_alias = _build_variant_lookup(models) + + html = fetch_index_html() + paths = extract_paths(html) + + runs: list[RunRef] = [] + for p in paths: + r = parse_run(p, models, variant_to_alias) + if r is None: + continue + if r.game not in games: + continue + if r.rounds != args.rounds or r.players != args.players: + continue + runs.append(r) + + if args.strategy == "latest-per-pair-game": + best: dict[tuple[str, tuple[str, str]], RunRef] = {} + for r in runs: + pair = tuple(sorted((r.p1, r.p2))) + k = (r.game, pair) + prev = best.get(k) + if prev is None or int(r.ts) > int(prev.ts): + best[k] = r + selected = sorted(best.values(), key=lambda x: (x.game, tuple(sorted((x.p1, x.p2))), x.ts)) + else: + selected = sorted(runs, key=lambda x: (x.game, x.p1, x.p2, x.ts)) + + ok = 0 + fail = 0 + manifest = [] + for r in selected: + url = build_download_url(r.rel_path) + page_url = build_game_page_url(r.rel_path) + out_file = args.output_root / r.rel_path / "metadata.json" + success = download_metadata_via_game_page(r.rel_path, out_file) + if not success: + success = download(url, out_file) + manifest.append( + { + "rel_path": r.rel_path, + "game": r.game, + "p1": r.p1, + "p2": r.p2, + "rounds": r.rounds, + "players": r.players, + "sims": r.sims, + "ts": r.ts, + "page_url": page_url, + "download_url": url, + "ok": success, + "local_metadata": str(out_file), + } + ) + if success: + ok += 1 + else: + fail += 1 + + args.output_root.mkdir(parents=True, exist_ok=True) + (args.output_root / "download_manifest.json").write_text(json.dumps(manifest, indent=2)) + + summary = { + "selected_runs": len(selected), + "download_ok": ok, + "download_failed": fail, + "output_root": str(args.output_root), + "strategy": args.strategy, + "models": sorted(models), + "games": sorted(games), + } + (args.output_root / "summary.json").write_text(json.dumps(summary, indent=2)) + print(json.dumps(summary, indent=2)) + return 0 if fail == 0 else 2 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/watch_sweep_progress.sh b/scripts/watch_sweep_progress.sh new file mode 100755 index 00000000..1729f382 --- /dev/null +++ b/scripts/watch_sweep_progress.sh @@ -0,0 +1,199 @@ +#!/usr/bin/env bash + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" + +RUN_ROOT="" +OPPONENT_ALIAS="gpt-5" +INTERVAL=15 +ONCE=0 +ALL_TIERS=0 + +usage() { + cat <<'EOF' +Usage: + scripts/watch_sweep_progress.sh [options] + +Options: + --run-root Explicit sweep log root. + --opponent-alias Match generated config dirs: *_vs_ (default: gpt-5). + --all-tiers Show default, low, medium, and high tiers together. + --interval Refresh interval (default: 15). + --once Print one snapshot and exit. + -h, --help Show help. + +Default run-root auto-detection order: + 1) latest logs/new_openai_sweep_* + 2) latest logs/gpt54_vs_gpt53codex_reasoning_* +EOF +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --run-root) + RUN_ROOT="${2:-}" + shift 2 + ;; + --opponent-alias) + OPPONENT_ALIAS="${2:-}" + shift 2 + ;; + --all-tiers) + ALL_TIERS=1 + shift + ;; + --interval) + INTERVAL="${2:-}" + shift 2 + ;; + --once) + ONCE=1 + shift + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${RUN_ROOT}" ]]; then + RUN_ROOT="$( + ls -td \ + "${REPO_ROOT}"/logs/new_openai_sweep_* \ + "${REPO_ROOT}"/logs/gpt54_vs_gpt53codex_reasoning_* \ + 2>/dev/null | head -n 1 || true + )" +fi + +if [[ -z "${RUN_ROOT}" || ! -d "${RUN_ROOT}" ]]; then + echo "No valid run root found. Pass --run-root explicitly." >&2 + exit 1 +fi + +if ! [[ "${INTERVAL}" =~ ^[0-9]+$ ]]; then + echo "--interval must be an integer number of seconds." >&2 + exit 1 +fi + +print_snapshot() { + local opponent_alias="$1" + cd "${REPO_ROOT}" + uv run python - "${REPO_ROOT}" "${RUN_ROOT}" "${opponent_alias}" <<'PY' +from pathlib import Path +import json +import yaml + +from codeclash import CONFIG_DIR +from codeclash.utils.yaml_utils import resolve_includes + +repo = Path(__import__("sys").argv[1]) +run_root = Path(__import__("sys").argv[2]) +opponent_alias = __import__("sys").argv[3] + +def normalize_players(players: list[str], opponent_alias: str) -> list[str]: + """Normalize known stale aliases in generated configs/metadata for matching.""" + suffix = None + prefix = "gpt-5.3-codex-" + if opponent_alias.startswith(prefix): + suffix = opponent_alias[len(prefix):] + + normalized = [] + for player in players: + if player == "gpt5" and suffix: + normalized.append(f"gpt-5.4-{suffix}") + else: + normalized.append(player) + return sorted(normalized) + +cfgs = sorted((repo / "configs" / "generated").glob(f"*_vs_{opponent_alias}/*.yaml")) +print(f"RUN_ROOT: {run_root}") +print(f"TOTAL CONFIGS: {len(cfgs)}") + +metas = [] +for m in run_root.rglob("metadata.json"): + try: + md = json.loads(m.read_text()) + cc = md.get("config", {}) + metas.append( + ( + cc.get("game", {}).get("name"), + normalize_players([p.get("name") for p in cc.get("players", [])], opponent_alias), + md, + m, + ) + ) + except Exception: + pass + +done = partial = pending = 0 +for c in cfgs: + cfg = yaml.safe_load(resolve_includes(c.read_text(), base_dir=CONFIG_DIR)) + game = cfg["game"]["name"] + rounds = int(cfg["tournament"]["rounds"]) + players = normalize_players([p["name"] for p in cfg["players"]], opponent_alias) + + # Pick newest metadata for this game+player pair (important when retries create multiple folders). 
+ hit = None + newest_mtime = -1.0 + for g, p, md, meta_path in metas: + if g != game or p != players: + continue + ts = float((md.get("timing") or {}).get("start_time", 0.0)) + if ts < 1.0: + ts = meta_path.stat().st_mtime + if ts >= newest_mtime: + newest_mtime = ts + hit = md + + if not hit: + st = "PENDING" + pending += 1 + else: + rs = hit.get("round_stats", {}) + st = "DONE" if len(rs) >= rounds + 1 else "PARTIAL" + done += st == "DONE" + partial += st == "PARTIAL" + + print(f"{st:7} {c.name}") + +print(f"\nSUMMARY done={done} partial={partial} pending={pending}") +PY +} + +print_all_tiers() { + local tier + for tier in default low medium high; do + echo "===== ${tier} =====" + print_snapshot "gpt-5.3-codex-${tier}" + echo + done +} + +if [[ "${ONCE}" -eq 1 ]]; then + if [[ "${ALL_TIERS}" -eq 1 ]]; then + print_all_tiers + else + print_snapshot "${OPPONENT_ALIAS}" + fi + exit 0 +fi + +while true; do + clear + echo "Sweep Progress Monitor ($(date))" + echo + if [[ "${ALL_TIERS}" -eq 1 ]]; then + print_all_tiers + else + print_snapshot "${OPPONENT_ALIAS}" + fi + sleep "${INTERVAL}" +done diff --git a/tests/arenas/test_robocode.py b/tests/arenas/test_robocode.py index 31e232d9..0fafeb27 100644 --- a/tests/arenas/test_robocode.py +++ b/tests/arenas/test_robocode.py @@ -8,6 +8,7 @@ from codeclash.arenas.arena import RoundStats from codeclash.arenas.robocode.robocode import RC_FILE, SIMS_PER_RUN, RoboCodeArena +from codeclash.constants import RESULT_TIE from .conftest import MockPlayer @@ -225,6 +226,37 @@ def test_parse_results_player2_wins(self, arena, tmp_log_dir): assert stats.scores["Alice"] == 4500 assert stats.scores["Bob"] == 9500 + def test_parse_results_tie(self, arena, tmp_log_dir): + """Equal total scores should be recorded as a tie, not assigned to the first player.""" + round_dir = tmp_log_dir / "rounds" / "1" + round_dir.mkdir(parents=True) + + self._create_results_file( + round_dir, + 0, + [ + (1, "Alice.MyTank", 0), + (2, "Bob.MyTank", 0), + ], + ) + self._create_results_file( + round_dir, + 1, + [ + (1, "Alice.MyTank", 0), + (2, "Bob.MyTank", 0), + ], + ) + + agents = [MockPlayer("Alice"), MockPlayer("Bob")] + stats = RoundStats(round_num=1, agents=agents) + + arena.get_results(agents, round_num=1, stats=stats) + + assert stats.winner == RESULT_TIE + assert stats.scores["Alice"] == 0 + assert stats.scores["Bob"] == 0 + class TestRoboCodeConfig: """Tests for RoboCodeArena configuration and properties."""