2 changes: 1 addition & 1 deletion .env.example
@@ -9,7 +9,7 @@ OPENAI_API_KEY=sk-...
ANTHROPIC_API_KEY=sk-ant-api03-...

# Google Gemini
GEMINI_API_KEY=...
GEMINI_API_KEY=

# DeepSeek
DEEPSEEK_API_KEY=sk-...
45 changes: 33 additions & 12 deletions scripts/benchmark_eval_analysis.py
@@ -53,7 +53,7 @@ def patch(eval_results, dataset):
"""
Patch the eval results with the dataset
"""
for pid in range(1, len(dataset) + 1):
for pid in dataset.get_problem_ids():
if str(pid) not in eval_results:
eval_results[str(pid)] = {
"sample_id": 0,
@@ -161,19 +161,40 @@ def analyze_greedy_eval(run_name, hardware, baseline, level,
)

# Extract the speedup values
is_correct = np.array([entry["correctness"] for entry in eval_results.values()])
baseline_speed = np.array(
[entry["mean"] for entry in baseline_results[f"level{level}"].values()]
)
actual_speed = np.array([entry["runtime"] for entry in eval_results.values()])
is_correct_list = []
baseline_speed_list = []
actual_speed_list = []

# Sort problem IDs to ensure consistent order
sorted_pids = sorted(dataset.get_problem_ids())

for pid in sorted_pids:
# Get eval result
if str(pid) not in eval_results:
print(f"Warning: Problem {pid} not found in eval results")
continue
eval_entry = eval_results[str(pid)]

# Get baseline result
problem = dataset.get_problem_by_id(pid)
problem_name = problem.name

if problem_name not in baseline_results[f"level{level}"]:
print(f"Warning: Problem {problem_name} not found in baseline results")
continue

baseline_entry = baseline_results[f"level{level}"][problem_name]

is_correct_list.append(eval_entry["correctness"])
actual_speed_list.append(eval_entry["runtime"])
baseline_speed_list.append(baseline_entry["mean"])

is_correct = np.array(is_correct_list)
baseline_speed = np.array(baseline_speed_list)
actual_speed = np.array(actual_speed_list)
n = len(is_correct)

assert (
len(baseline_speed) == n
), "Baseline speedup values do not match the number of eval results"
assert (
len(actual_speed) == n
), "Actual speedup values do not match the number of eval results"
print(f"Aligned {n} problems for analysis")

# Calculate the metrics
gmsr_correct = geometric_mean_speed_ratio_correct_only(
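The alignment loop added above touches only a small surface of the unified dataset object: get_problem_ids(), get_problem_by_id(pid), and problem records exposing .name and .code. As orientation, here is a minimal hypothetical sketch of such an interface; the real kernelbench.dataset implementation is not part of this diff and may differ.

# Hypothetical sketch of the dataset interface these scripts rely on;
# the actual kernelbench.dataset code is not shown in this PR.
from dataclasses import dataclass

@dataclass
class Problem:
    problem_id: int
    name: str   # e.g. "1_Square_matrix_multiplication_.py"
    code: str   # reference architecture source

class KernelBenchDataset:
    def __init__(self, problems: list[Problem]):
        self._by_id = {p.problem_id: p for p in problems}

    def get_problem_ids(self) -> list[int]:
        return sorted(self._by_id)

    def get_problem_by_id(self, pid: int) -> Problem:
        return self._by_id[pid]

    def __len__(self) -> int:
        return len(self._by_id)
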
85 changes: 28 additions & 57 deletions scripts/eval_from_generations.py
@@ -12,7 +12,6 @@
import pydra
import torch

from datasets import load_dataset
from pydra import Config, REQUIRED

# Import only what we need
@@ -255,36 +254,17 @@ def evaluate_single_sample_modal(


def fetch_ref_arch_from_problem_id(
dataset, problem_id: int, dataset_src: str
dataset, problem_id: int, dataset_src: str = None
) -> str | None:
"""
Fetch reference architecture from problem directory
Either from Hugging Face or Local Dataset
Fetch reference architecture from problem directory.
Uses the unified dataset interface.

Note: dataset_src parameter is kept for backward compatibility but ignored
since the dataset object already handles both sources.
"""
if dataset_src == "huggingface":
curr_problem_row = dataset.filter(
lambda x: x["problem_id"] == problem_id, num_proc=None, desc=None
)
ref_arch_src = curr_problem_row["code"][0]
problem_name = curr_problem_row["name"][0]

elif dataset_src == "local":
problem_idx_in_dataset = (
problem_id - 1
) # due to dataset list being 0-indexed locally
ref_arch_path = dataset[problem_idx_in_dataset]

problem_name = os.path.basename(ref_arch_path)
ref_arch_src = read_file(ref_arch_path)

# verify
# Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
problem_number = int(problem_name.split("_")[0])
assert (
problem_number == problem_id
), f"Problem number in filename ({problem_number}) does not match config problem_id ({problem_id})"

return ref_arch_src
problem = dataset.get_problem_by_id(problem_id)
return problem.code
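
Since dataset_src now defaults to None and is ignored, existing call sites keep working while new ones can drop the argument; a brief usage sketch with illustrative values:

# Usage sketch: both call forms below are equivalent after this change.
ref_src = fetch_ref_arch_from_problem_id(dataset, 1, dataset_src="local")  # old-style call site
ref_src = fetch_ref_arch_from_problem_id(dataset, 1)                       # new-style call site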


def fetch_kernel_from_disk(
@@ -822,57 +802,48 @@ def main(config: EvalConfig):
if mp.get_start_method(allow_none=True) is None:
mp.set_start_method("spawn")

# Dataset Configurations
if config.dataset_src == "huggingface":
dataset = load_dataset(config.dataset_name)
curr_level_dataset = dataset[f"level_{config.level}"]
elif config.dataset_src == "local":
curr_level_dataset = construct_kernelbench_dataset(config.level)

num_problems_in_level = len(curr_level_dataset)

# Determine which problem IDs to evaluate
# you can either specify a list of problem IDs (prioritize) or a subset range
# NOTE: later once the dataset PR is in we will link the representative subset as a built-in preset too
if config.problem_ids is not None:
# Use specific problem IDs if provided
problem_id_list = config.problem_ids
for pid in problem_id_list:
assert 1 <= pid <= num_problems_in_level, f"Problem ID {pid} out of range for Level {config.level}"
elif config.subset == (None, None):
problem_id_list = list(range(1, num_problems_in_level + 1))
# Dataset Configurations - Unified loading
dataset = construct_kernelbench_dataset(
level=config.level,
source=config.dataset_src,
dataset_name=config.dataset_name,
)

all_problem_ids = dataset.get_problem_ids()

if config.subset == (None, None):
problem_ids_to_run = all_problem_ids
else:
assert (
config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level
), f"Subset range {config.subset} out of range for Level {config.level}"
problem_id_list = list(range(config.subset[0], config.subset[1] + 1))
start, end = config.subset
problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end]
if not problem_ids_to_run:
print(f"Warning: No problems found in subset range {config.subset}")

print(
f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_id_list}"
f"Evaluating {config.num_samples_per_problem} sample(s) each for level {config.level} problems: {problem_ids_to_run}"
)

run_dir = os.path.join(config.runs_dir, config.run_name)
eval_file_path = os.path.join(run_dir, f"eval_results.json")

# To Debug
# single_eval_example(config, curr_level_dataset, run_dir, eval_file_path)
# single_eval_example(config, dataset, run_dir, eval_file_path)

total_work = []
for problem_id in problem_id_list:
for problem_id in problem_ids_to_run:
for sample_id in range(config.num_samples_per_problem):
if not check_if_eval_exists_local(problem_id, sample_id, eval_file_path):
total_work.append((problem_id, sample_id))

print(
f"Start evaluation on {len(total_work)} unevaluated samples"
f" for problems: {problem_id_list}"
f" in range: {problem_ids_to_run}"
)
# Build Cache on CPU as that is faster (only for local mode)
if config.build_cache and config.eval_mode == "local":
compile.batch_compile(total_work, config.to_dict())

# Batch Eval on multiple GPUs in parallel
batch_eval(total_work, config, curr_level_dataset, run_dir, eval_file_path)
batch_eval(total_work, config, dataset, run_dir, eval_file_path)

# Calculate pass@k metrics if multiple samples per problem were evaluated
if config.num_samples_per_problem > 1:
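For reference, the unified loading plus subset filtering introduced above reduces to a few lines; the sketch below mirrors it with illustrative values, and the dataset name is a placeholder (the script passes config.dataset_name):

# Sketch of the unified loading and subset filtering used in main();
# argument values are illustrative, not defaults from this repo.
from kernelbench.dataset import construct_kernelbench_dataset

dataset = construct_kernelbench_dataset(
    level=1,
    source="local",                      # or "huggingface"
    dataset_name="<hf-dataset-name>",    # placeholder for config.dataset_name
)

all_problem_ids = dataset.get_problem_ids()
start, end = 3, 10                       # inclusive bounds, mirroring config.subset
problem_ids_to_run = [pid for pid in all_problem_ids if start <= pid <= end]
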
54 changes: 13 additions & 41 deletions scripts/generate_and_eval_single_sample.py
@@ -5,16 +5,12 @@
import json
import modal

from datasets import load_dataset

from kernelbench.dataset import construct_kernelbench_dataset
from kernelbench.eval import eval_kernel_against_ref
from kernelbench.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt
from kernelbench.utils import (
create_inference_server_from_presets,
extract_first_code,
query_server,
read_file,
set_gpu_arch,
)
from kernelbench.eval import get_torch_dtype_from_string
@@ -116,13 +112,14 @@ def main(config: EvalConfig):

print(f"Starting Eval with config: {config}")

# Configurations

if config.dataset_src == "huggingface":
dataset = load_dataset(config.dataset_name)
curr_level_dataset = dataset[f"level_{config.level}"]
elif config.dataset_src == "local":
curr_level_dataset = construct_kernelbench_dataset(config.level)
# Configurations - Unified dataset loading (works for both HF and local)
from kernelbench.dataset import construct_kernelbench_dataset

dataset = construct_kernelbench_dataset(
level=config.level,
source=config.dataset_src,
dataset_name=config.dataset_name,
)

if config.gpu_arch:
set_gpu_arch(config.gpu_arch) # otherwise build for all architectures
@@ -131,41 +128,16 @@ def main(config: EvalConfig):
os.makedirs(config.logdir, exist_ok=True)

# Problem Checks
num_problems = len(curr_level_dataset)
num_problems = len(dataset)
print(f"Number of problems in Level {config.level}: {num_problems}")
print(
f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}"
)

assert (
config.problem_id <= num_problems
), f"Problem ID {config.problem_id} out of range for Level {config.level}"

# TODO: refactor dataset fetching logic to be as clean as posisble.
# 1. Fetch Problem
if config.dataset_src == "huggingface":

curr_problem_row = curr_level_dataset.filter(
lambda x: x["problem_id"] == config.problem_id
)
ref_arch_src = curr_problem_row["code"][0]
problem_name = curr_problem_row["name"][0]

elif config.dataset_src == "local":
problem_idx_in_dataset = (
config.problem_id - 1
) # due to dataset list being 0-indexed locally
ref_arch_path = curr_level_dataset[problem_idx_in_dataset]

problem_name = os.path.basename(ref_arch_path)
ref_arch_src = read_file(ref_arch_path)
# import pdb; pdb.set_trace()

# Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
problem_number = int(problem_name.split("_")[0])
assert (
problem_number == config.problem_id
), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
# Fetch problem - unified interface, no branching needed
problem = dataset.get_problem_by_id(config.problem_id)
ref_arch_src = problem.code
problem_name = problem.name

# 2. Generate Sample
# Create inference function with config parameters
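The removed branch also dropped the filename cross-check that compared the leading number of e.g. "1_Square_matrix_multiplication_.py" against config.problem_id. If that sanity check is still wanted, it can be expressed against the unified interface; a hedged sketch, assuming problem names keep the leading-number convention seen in the removed code:

# Optional sanity check mirroring the removed filename assertion.
# Assumes problem.name still begins with the problem number; that naming
# convention comes from the removed code and is not guaranteed here.
problem = dataset.get_problem_by_id(config.problem_id)
problem_number = int(problem.name.split("_")[0])
assert problem_number == config.problem_id, (
    f"Problem number in name ({problem_number}) does not match "
    f"config problem_id ({config.problem_id})"
)
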
44 changes: 13 additions & 31 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -11,10 +11,8 @@
import json
import modal

from datasets import load_dataset

#from src.dataset import construct_kernelbench_dataset
from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets
from kernelbench.dataset import construct_kernelbench_dataset
from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets
Copilot AI commented on Jan 6, 2026:
Import of 'query_server' is not used.
Import of 'set_gpu_arch' is not used.

Suggested change
from kernelbench.utils import extract_first_code, query_server, set_gpu_arch, create_inference_server_from_presets
from kernelbench.utils import extract_first_code, create_inference_server_from_presets


app = modal.App("eval_single_sample")

@@ -155,41 +153,25 @@ def main(config: EvalConfig):

print(f"Starting Eval with config: {config}")

# Configurations

if config.dataset_src == "huggingface":
dataset = load_dataset(config.dataset_name)
curr_level_dataset = dataset[f"level_{config.level}"]
# Configurations - Unified dataset loading (works for both HF and local)
dataset = construct_kernelbench_dataset(
level=config.level,
source=config.dataset_src,
dataset_name=config.dataset_name,
)

if config.log:
os.makedirs(config.logdir, exist_ok=True)

# Problem Checks
num_problems = len(curr_level_dataset)
num_problems = len(dataset)
print(f"Number of problems in Level {config.level}: {num_problems}")
print(f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}")

assert config.problem_id <= num_problems, f"Problem ID {config.problem_id} out of range for Level {config.level}"


# 1. Fetch Problem
if config.dataset_src == "huggingface":

curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id)
ref_arch_src = curr_problem_row["code"][0]
problem_name = curr_problem_row["name"][0]

elif config.dataset_src == "local":
problem_idx_in_dataset = config.problem_id - 1 # due to dataset list being 0-indexed locally
ref_arch_path = curr_level_dataset[problem_idx_in_dataset]

problem_name = os.path.basename(ref_arch_path)
ref_arch_src = read_file(ref_arch_path)
# import pdb; pdb.set_trace()

# Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py")
problem_number = int(problem_name.split("_")[0])
assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})"
# Fetch problem - unified interface, no branching needed
problem = dataset.get_problem_by_id(config.problem_id)
ref_arch_src = problem.code
problem_name = problem.name


# 2. Generate Sample