Binary file modified .gitignore
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,13 +10,15 @@ modal
# DSLs
nvidia-cutlass-dsl
tilelang
triton

# helper
tqdm
packaging
pydra_config
pytest
ninja
cupy-cuda12x

# Numerics
einops
6 changes: 6 additions & 0 deletions scripts/eval_from_generations.py
@@ -113,6 +113,7 @@ def __init__(self):
self.num_perf_trials = 100
self.timeout = 180 # in seconds
self.measure_performance = True
self.timing_method = "cuda_event"

# Eval Flow setting
# To speedup evaluation, you can start building the kernel on CPU on disk as cache
@@ -173,6 +174,7 @@ def evaluate_single_sample_modal(
num_correct_trials: int = 5,
num_perf_trials: int = 100,
measure_performance: bool = True,
timing_method: str = "cuda_event",
verbose: bool = False,
backend: str = "cuda",
precision: str = "fp32",
@@ -212,6 +214,7 @@ def evaluate_single_sample_modal(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
@@ -324,6 +327,7 @@ def evaluate_single_sample(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
@@ -384,6 +388,7 @@ def evaluate_single_sample_modal_direct(
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
)
return eval_result
@@ -502,6 +507,7 @@ def batch_eval_modal(
num_correct_trials=config.num_correct_trials,
num_perf_trials=config.num_perf_trials,
measure_performance=config.measure_performance,
timing_method=config.timing_method,
verbose=config.verbose,
backend=config.backend,
precision=config.precision,
2 changes: 2 additions & 0 deletions scripts/generate_and_eval_single_sample.py
@@ -73,6 +73,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py

# Prompt construction
self.prompt_option = "one_shot" # choices: zero_shot, one_shot, few_shot
@@ -267,6 +268,7 @@ def main(config: EvalConfig):
custom_kernel,
verbose=config.verbose,
measure_performance=True,
timing_method=config.timing_method,
num_correct_trials=5,
num_perf_trials=100,
backend=config.backend,
7 changes: 4 additions & 3 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -14,7 +14,6 @@
from datasets import load_dataset

#from src.dataset import construct_kernelbench_dataset
from src.eval import eval_kernel_against_ref
from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt
from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets

@@ -75,6 +74,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py
# Prompt generation settings
self.prompt_option = "one_shot" # zero_shot, one_shot, few_shot
self.include_hardware_info = False
@@ -110,7 +110,7 @@ def __repr__(self):
class EvalFunc:

@modal.method()
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision):
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, timing_method):
# 3. Evaluate Kernel
# NOTE: no need to wrap around process here as only a single sample
# see batch eval for examples of process isolation
@@ -121,6 +121,7 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc
modal_set_gpu_arch(gpu_arch)
return eval_kernel_against_ref(
ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True,
timing_method=timing_method,
num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision)
)

@@ -274,7 +275,7 @@ def main(config: EvalConfig):

with app.run():
kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote(
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, config.timing_method
)

print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}")
14 changes: 10 additions & 4 deletions scripts/generate_baseline_time.py
@@ -2,11 +2,13 @@
import numpy as np
from src.eval import (
load_original_model_and_inputs,
time_execution_with_cuda_event,
get_timing_stats,
set_seed,
fetch_ref_arch_from_problem_id,
)
from src.timing import (
get_timing_function,
get_timing_stats,
)
from src.dataset import construct_problem_dataset_from_problem_dir
from src.utils import read_file
import os
@@ -81,6 +83,7 @@ def measure_program_time(
torch_compile_options: str="default",
device: torch.device="cuda:0",
verbose: bool = False,
timing_method: str = "cuda_event",
) -> dict:
"""
Measure the time of a KernelBench reference architecture
@@ -116,8 +119,11 @@

model = model.cuda(device=device)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, *inputs, num_trials=num_trials, verbose=verbose, device=device

# run chosen timing function
timing_fn = get_timing_function(timing_method)
elapsed_times = timing_fn(
model, inputs, num_trials=num_trials, verbose=verbose, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

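Note: src/timing.py itself is not among the hunks shown above, so the shape of get_timing_function and get_timing_stats is inferred from their call sites. Below is a minimal sketch, assuming the timing functions take (model, inputs, num_trials, verbose, device) as used in measure_program_time and that "cuda_event" maps to a CUDA-event-based timer; the helper name time_with_cuda_events and the exact stats fields are hypothetical.

import numpy as np
import torch

def time_with_cuda_events(model, inputs, num_trials=100, verbose=False, device=None):
    # Hypothetical CUDA-event timer; assumes `inputs` is a tuple of tensors
    # already on `device` and that the model has been moved to the GPU.
    elapsed_times = []
    for trial in range(num_trials):
        torch.cuda.synchronize(device=device)  # drain prior work before timing
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        model(*inputs)
        end.record()
        torch.cuda.synchronize(device=device)  # wait for the timed kernel(s)
        elapsed_times.append(start.elapsed_time(end))  # milliseconds
        if verbose:
            print(f"trial {trial}: {elapsed_times[-1]:.3f} ms")
    return elapsed_times

# Registry keyed by the timing_method strings used in the configs above.
_TIMING_FUNCTIONS = {"cuda_event": time_with_cuda_events}

def get_timing_function(timing_method: str):
    # Dispatch on the config value; "cuda_event" is the default throughout this PR.
    return _TIMING_FUNCTIONS[timing_method]

def get_timing_stats(elapsed_times, device=None):
    # Summary statistics over the per-trial times (the exact fields are an assumption).
    times = np.asarray(elapsed_times)
    return {
        "mean": float(times.mean()),
        "std": float(times.std()),
        "min": float(times.min()),
        "max": float(times.max()),
        "num_trials": len(times),
    }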
20 changes: 16 additions & 4 deletions scripts/run_and_check.py
@@ -57,6 +57,8 @@
Usage:
1. PyTorch reference is a local file (local eval)
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/few_shot/model_ex_tiled_matmul.py kernel_src_path=src/prompts/few_shot/model_new_ex_tiled_matmul.py eval_mode=local


2. PyTorch reference is a kernelbench problem (local eval)
python3 scripts/run_and_check.py ref_origin=kernelbench level=<level> problem_id=<problem_id> kernel_src_path=<path to model-generated kernel> eval_mode=local
@@ -101,6 +103,7 @@ def __init__(self):
# verbose logging
self.verbose = False
self.measure_performance = True
self.timing_method = "cuda_event" # see timing.py
self.build_dir_prefix = "" # if you want to specify a custom build directory
self.clear_cache = False # TODO

@@ -128,18 +131,23 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

try:
eval_result = kernel_eval.eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
build_dir=build_dir,
device=device,
backend=configs["backend"],
precision=kernel_eval.get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result
except Exception as e:
@@ -180,17 +188,21 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

eval_result = eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
device=device,
backend=configs["backend"],
precision=get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result

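Usage note: following the key=value overrides shown in the run_and_check.py docstring above, the new timing method should be selectable per run. An illustrative invocation; "cuda_event" is simply the default shown in the configs, and any other accepted values are not listed in this diff:

python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local timing_method=cuda_event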