27 commits
- db46141: Start seperate timing file for timing functions (kesavanramakrishnan, Nov 10, 2025)
- 5ff8891: Add tests, cache clearning, time, and do_bench (PaliC, Nov 16, 2025)
- 9581487: reorganize timing func, migrate cuda event with l2 cache from branch … (simonguozirui, Dec 12, 2025)
- 467f856: implement do_bench and cpu host timing, script to run all 4 timing me… (simonguozirui, Dec 12, 2025)
- 920a793: some annotations (Dec 12, 2025)
- 05d408f: run_and_check compatible (Dec 12, 2025)
- 2e724c6: Merge branch 'main' of github-simon:ScalingIntelligence/KernelBench i… (Dec 12, 2025)
- 2be968a: revert eval and add only necessary changes (Dec 12, 2025)
- 936f221: top_level eval entry point to set timing_method (simonguozirui, Dec 15, 2025)
- 2c36572: remove discard_first for cuda event and updated documentation (simonguozirui, Dec 15, 2025)
- 4909b1d: add discard_first for cuda_event (simonguozirui, Dec 15, 2025)
- 6c92786: add device context for profile on particular device (simonguozirui, Dec 16, 2025)
- 8a165d6: nit fix ready for merge (Dec 16, 2025)
- c063b81: type annotation for device (Dec 16, 2025)
- a7af124: benchmarking guide (PaliC, Dec 16, 2025)
- f33d4e6: merge (PaliC, Dec 16, 2025)
- b26e05f: benchmarking guide (PaliC, Dec 17, 2025)
- f8e6839: benchmarking guide (PaliC, Dec 17, 2025)
- 614ecfd: benchmarking guide (PaliC, Dec 17, 2025)
- 353e4e1: benchmarking guide (PaliC, Dec 17, 2025)
- 8c28b88: benchmarking guide (PaliC, Dec 17, 2025)
- d2207ea: try uv (PaliC, Dec 17, 2025)
- 3eb3901: uv support (PaliC, Dec 17, 2025)
- 3ac6148: uv support (PaliC, Dec 23, 2025)
- bdb7abb: uv support (PaliC, Dec 23, 2025)
- c7c3207: Delete CLAUDE.md (PaliC, Dec 23, 2025)
- 64996a3: Delete notebooks/benchmarking.ipynb (PaliC, Dec 23, 2025)
Binary file modified .gitignore
Binary file not shown.
10 changes: 8 additions & 2 deletions README.md
@@ -76,11 +76,17 @@ KernelBench/
```

## 🔧 Set up

### Using uv (recommended)
```bash
uv sync
```

### Using pip
```bash
conda create --name kernel-bench python=3.10
conda activate kernel-bench
pip install -r requirements.txt
pip install -e .
```

We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`.
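
For reference, the `python-dotenv` package declared in the new `pyproject.toml` is the usual way such a `.env` file gets loaded. A minimal sketch, assuming a key name not confirmed by `.env.example`:

```python
# Sketch only: load API keys from a .env file via python-dotenv, which this
# project declares as a dependency. The key name below is a hypothetical
# placeholder, not taken from .env.example.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
api_key = os.environ.get("OPENAI_API_KEY")  # hypothetical key name
```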
53 changes: 53 additions & 0 deletions pyproject.toml
@@ -0,0 +1,53 @@
[project]
name = "kernelbench"
version = "0.1.0"
description = "A benchmark for evaluating LLMs' ability to generate efficient GPU kernels"
readme = "README.md"
license = "MIT"
requires-python = ">=3.10"
dependencies = [
# Frameworks
"torch==2.9.0",
"transformers",
"datasets",
"modal",

# DSLs
"nvidia-cutlass-dsl",
"tilelang",
"triton",

# Helper
"tqdm",
"packaging",
"pydra_config",
"pytest",
"ninja",
"cupy-cuda12x",

# Numerics
"einops",
"python-dotenv",
"numpy",

# LLM API access
"openai",
"litellm[proxy]",
]

[project.optional-dependencies]
dev = [
"pytest",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["src"]

[dependency-groups]
dev = [
"pytest",
]
29 changes: 0 additions & 29 deletions requirements.txt

This file was deleted.

6 changes: 6 additions & 0 deletions scripts/eval_from_generations.py
@@ -113,6 +113,7 @@ def __init__(self):
self.num_perf_trials = 100
self.timeout = 180 # in seconds
self.measure_performance = True
self.timing_method = "cuda_event"

# Eval Flow setting
# To speedup evaluation, you can start building the kernel on CPU on disk as cache
@@ -173,6 +174,7 @@ def evaluate_single_sample_modal(
num_correct_trials: int = 5,
num_perf_trials: int = 100,
measure_performance: bool = True,
timing_method: str = "cuda_event",
verbose: bool = False,
backend: str = "cuda",
precision: str = "fp32",
@@ -212,6 +214,7 @@ def evaluate_single_sample_modal(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
@@ -324,6 +327,7 @@ def evaluate_single_sample(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
@@ -384,6 +388,7 @@ def evaluate_single_sample_modal_direct(
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
)
return eval_result
@@ -502,6 +507,7 @@ def batch_eval_modal(
num_correct_trials=config.num_correct_trials,
num_perf_trials=config.num_perf_trials,
measure_performance=config.measure_performance,
timing_method=config.timing_method,
verbose=config.verbose,
backend=config.backend,
precision=config.precision,
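
The new `timing_method` parameter is threaded through every call site above into `eval_kernel_against_ref`. A hedged sketch of a direct call, using only argument names visible in this diff; the two source strings are placeholders:

```python
# Sketch of calling eval_kernel_against_ref with the timing_method argument
# added by this PR; argument names come from the diff, values are the
# illustrative defaults set in the config classes above.
from src.eval import eval_kernel_against_ref

ref_arch_src = "..."  # PyTorch reference model source (placeholder)
kernel_src = "..."    # candidate kernel source (placeholder)

eval_result = eval_kernel_against_ref(
    original_model_src=ref_arch_src,
    custom_model_src=kernel_src,
    measure_performance=True,
    timing_method="cuda_event",  # default used throughout this PR
    verbose=False,
    num_correct_trials=5,
    num_perf_trials=100,
)
```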
2 changes: 2 additions & 0 deletions scripts/generate_and_eval_single_sample.py
@@ -73,6 +73,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py

# Prompt construction
self.prompt_option = "one_shot" # choices: zero_shot, one_shot, few_shot
@@ -267,6 +268,7 @@ def main(config: EvalConfig):
custom_kernel,
verbose=config.verbose,
measure_performance=True,
timing_method=config.timing_method,
num_correct_trials=5,
num_perf_trials=100,
backend=config.backend,
9 changes: 5 additions & 4 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -14,7 +14,6 @@
from datasets import load_dataset

#from src.dataset import construct_kernelbench_dataset
from src.eval import eval_kernel_against_ref
from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt
from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets

@@ -75,6 +74,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py
# Prompt generation settings
self.prompt_option = "one_shot" # zero_shot, one_shot, few_shot
self.include_hardware_info = False
@@ -102,15 +102,15 @@ def __repr__(self):
"g++-10",
"clang" # note i skip a step
)
.pip_install_from_requirements(os.path.join(REPO_TOP_DIR, "requirements.txt"))
.uv_sync(uv_project_dir=REPO_TOP_DIR)
.add_local_python_source("src")
)

@app.cls(image=image)
class EvalFunc:

@modal.method()
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision):
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, timing_method):
# 3. Evaluate Kernel
# NOTE: no need to wrap around process here as only a single sample
# see batch eval for examples of process isolation
@@ -121,6 +121,7 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc
modal_set_gpu_arch(gpu_arch)
return eval_kernel_against_ref(
ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True,
timing_method=timing_method,
num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision)
)

@@ -274,7 +275,7 @@ def main(config: EvalConfig):

with app.run():
kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote(
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, config.timing_method
)

print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}")
14 changes: 10 additions & 4 deletions scripts/generate_baseline_time.py
@@ -2,11 +2,13 @@
import numpy as np
from src.eval import (
load_original_model_and_inputs,
time_execution_with_cuda_event,
get_timing_stats,
set_seed,
fetch_ref_arch_from_problem_id,
)
from src.timing import (
get_timing_function,
get_timing_stats,
)
from src.dataset import construct_problem_dataset_from_problem_dir
from src.utils import read_file
import os
@@ -81,6 +83,7 @@ def measure_program_time(
torch_compile_options: str="default",
device: torch.device="cuda:0",
verbose: bool = False,
timing_method: str = "cuda_event",
) -> dict:
"""
Measure the time of a KernelBench reference architecture
@@ -116,8 +119,11 @@

model = model.cuda(device=device)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, *inputs, num_trials=num_trials, verbose=verbose, device=device

# run chosen timing function
timing_fn = get_timing_function(timing_method)
elapsed_times = timing_fn(
model, inputs, num_trials=num_trials, verbose=verbose, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

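
The diff above swaps the old `time_execution_with_cuda_event` import for a `get_timing_function` dispatcher from the new `src/timing.py`, which is not shown in this excerpt. Below is a minimal sketch of the interface it implies: only `get_timing_function` and `get_timing_stats` appear in the diff; the backend names, bodies, and statistics keys are assumptions (the commit history mentions cuda_event, do_bench, and CPU host timing). Note that the call site passes `inputs` as a single argument, so the backends unpack it themselves:

```python
# Hypothetical sketch of the src/timing.py interface used above; everything
# beyond the two public names is an assumption for illustration.
import time
import numpy as np
import torch

def _time_with_cuda_events(model, inputs, num_trials=100, verbose=False, device=None):
    """Per-trial GPU timing via CUDA events; returns elapsed times in ms."""
    times = []
    for _ in range(num_trials):
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        model(*inputs)
        end.record()
        torch.cuda.synchronize(device=device)
        times.append(start.elapsed_time(end))  # milliseconds
    return times

def _time_with_host_clock(model, inputs, num_trials=100, verbose=False, device=None):
    """Wall-clock timing on the host, synchronizing around each trial."""
    times = []
    for _ in range(num_trials):
        torch.cuda.synchronize(device=device)
        t0 = time.perf_counter()
        model(*inputs)
        torch.cuda.synchronize(device=device)
        times.append((time.perf_counter() - t0) * 1000.0)  # ms
    return times

_TIMING_FUNCTIONS = {
    "cuda_event": _time_with_cuda_events,  # name confirmed by the diff
    "cpu_host": _time_with_host_clock,     # hypothetical backend name
}

def get_timing_function(timing_method: str):
    """Look up a timing backend by name."""
    try:
        return _TIMING_FUNCTIONS[timing_method]
    except KeyError:
        raise ValueError(f"unknown timing_method: {timing_method!r}")

def get_timing_stats(elapsed_times, device=None):
    """Summarize per-trial times (ms) into basic statistics."""
    arr = np.asarray(elapsed_times, dtype=np.float64)
    return {"mean": float(arr.mean()), "std": float(arr.std()),
            "min": float(arr.min()), "max": float(arr.max())}
```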
2 changes: 1 addition & 1 deletion scripts/generate_baseline_time_modal.py
@@ -91,7 +91,7 @@ def __init__(self):
"g++-10",
"clang" # note i skip a step
)
.pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt"))
.uv_sync(uv_project_dir=REPO_TOP_PATH)
.add_local_dir(
KERNEL_BENCH_PATH,
remote_path="/root/KernelBench"
20 changes: 16 additions & 4 deletions scripts/run_and_check.py
@@ -57,6 +57,8 @@
Usage:
1. PyTorch reference is a local file (local eval)
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/few_shot/model_ex_tiled_matmul.py kernel_src_path=src/prompts/few_shot/model_new_ex_tiled_matmul.py eval_mode=local


2. PyTorch reference is a kernelbench problem (local eval)
python3 scripts/run_and_check.py ref_origin=kernelbench level=<level> problem_id=<problem_id> kernel_src_path=<path to model-generated kernel> eval_mode=local
@@ -101,6 +103,7 @@ def __init__(self):
# verbose logging
self.verbose = False
self.measure_performance = True
self.timing_method = "cuda_event" # see timing.py
self.build_dir_prefix = "" # if you want to specify a custom build directory
self.clear_cache = False # TODO

@@ -128,18 +131,23 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

try:
eval_result = kernel_eval.eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
build_dir=build_dir,
device=device,
backend=configs["backend"],
precision=kernel_eval.get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result
except Exception as e:
@@ -180,17 +188,21 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

eval_result = eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
device=device,
backend=configs["backend"],
precision=get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result

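
Since `run_and_check.py` pulls the new key out of its config dict, a hedged usage sketch of the dict those functions consume (the full signatures are elided in the diff; keys mirror those read above, values are illustrative):

```python
# Sketch: config keys consumed by evaluate_single_sample_src in this diff.
# "timing_method" is the key added by this PR; values are illustrative.
configs = {
    "num_correct_trials": 5,
    "num_perf_trials": 100,
    "verbose": False,
    "measure_performance": True,
    "timing_method": "cuda_event",  # new in this PR; see timing.py
    "backend": "cuda",
    "precision": "fp32",
}
```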
8 changes: 0 additions & 8 deletions setup.py

This file was deleted.
