From ed54e1c426902ed504dd4f21379ee74856338fea Mon Sep 17 00:00:00 2001
From: pythonomar22
Date: Wed, 5 Nov 2025 15:04:36 -0800
Subject: [PATCH 1/2] adding modal support to run and check, debugging tensor
 size error

---
 scripts/run_and_check.py | 244 ++++++++++++++++++++++++++++++++-------
 1 file changed, 205 insertions(+), 39 deletions(-)

diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py
index a863afcd..2d0bd7a5 100644
--- a/scripts/run_and_check.py
+++ b/scripts/run_and_check.py
@@ -4,13 +4,61 @@
 from pydra import REQUIRED, Config
 import os
 from datasets import load_dataset
-
+import modal
 from src import eval as kernel_eval
 from src import utils as kernel_utils
 from scripts.generate_baseline_time import measure_program_time
 from src.utils import read_file
 
+# Modal setup
+app = modal.App("run_and_check")
+gpu_arch_mapping = {
+    "L40S": ["Ada"],
+    "H100": ["Hopper"],
+    "H200": ["Hopper"],
+    "A100": ["Ampere"],
+    "A100-80GB": ["Ampere"],
+    "L4": ["Ada"],
+    "T4": ["Turing"],
+    "A10G": ["Ampere"]
+}
+
+REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench")
+
+cuda_version = "12.4.0"
+flavor = "devel"
+operating_sys = "ubuntu22.04"
+tag = f"{cuda_version}-{flavor}-{operating_sys}"
+
+image = (
+    modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
+    .apt_install("git", "gcc-10", "g++-10", "clang")
+    .pip_install(
+        "anthropic",
+        "numpy",
+        "openai",
+        "packaging",
+        "pydra_config",
+        "torch==2.5.0",
+        "tqdm",
+        "datasets",
+        "transformers",
+        "google-generativeai",
+        "together",
+        "pytest",
+        "ninja",
+        "utils",
+        "einops",
+        "python-dotenv",
+        "litellm[proxy]",
+    )
+    .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench")
+    .add_local_python_source("src")
+    .add_local_python_source("scripts")
+)
+
 """
 Run a pair of KernelBench format (problem, solution)
 to check if solution is correct and compute speedup
@@ -25,11 +73,17 @@
 ====================================================
 Usage:
-1. PyTorch reference is a local file
-python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py
+1. PyTorch reference is a local file (local eval)
+python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local
+
+2. PyTorch reference is a kernelbench problem (local eval)
+python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path= eval_mode=local
 
-2. PyTorch refernece is a kernelbench problem
-python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path=
+3. PyTorch reference is a local file (modal eval on cloud GPU)
+python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=modal gpu=H100
+
+4. PyTorch reference is a kernelbench problem (modal eval on cloud GPU)
+python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path= eval_mode=modal gpu=L40S
 
 ====================================================
 """
@@ -51,6 +105,9 @@ def __init__(self):
 
         # Solution src definition
         self.kernel_src_path = ""
+        # Evaluation mode
+        self.eval_mode = "local"  # either "local" or "modal"
+        self.gpu = "L40S"  # GPU type for modal (L40S, H100, H200, A100, etc.)
 
         # KernelBench Eval specific
         # number of trials to run for correctness
@@ -66,7 +123,7 @@ def __init__(self):
         self.clear_cache = False  # TODO
 
         # Replace with your NVIDIA GPU architecture, e.g. ["Hopper"]
-        self.gpu_arch = ["Ada"]
+        self.gpu_arch = ["Ada"]
 
         self.precision = "fp32"
         self.backend = "cuda"
@@ -119,11 +176,70 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
             "hardware": torch.cuda.get_device_name(device=device),
             "device": str(device)
         }
-        eval_result = kernel_eval.KernelExecResult(compiled=False, correctness=False,
+        eval_result = kernel_eval.KernelExecResult(compiled=False, correctness=False,
                                                    metadata=metadata)
         return eval_result
 
+# Modal evaluation class
+@app.cls(image=image, scaledown_window=5)
+class EvalFunc:
+
+    @modal.method()
+    def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, configs: dict, gpu_arch: list):
+        """Evaluate a single sample source code against a reference source code on Modal"""
+        from src.utils import set_gpu_arch
+        from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string
+
+        set_gpu_arch(gpu_arch)
+        device = torch.device("cuda:0")
+
+        num_correct_trials = configs["num_correct_trials"]
+        num_perf_trials = configs["num_perf_trials"]
+        verbose = configs["verbose"]
+        measure_performance = configs["measure_performance"]
+
+        eval_result = eval_kernel_against_ref(
+            original_model_src=ref_arch_src,
+            custom_model_src=kernel_src,
+            measure_performance=measure_performance,
+            verbose=verbose,
+            num_correct_trials=num_correct_trials,
+            num_perf_trials=num_perf_trials,
+            device=device,
+            backend=configs["backend"],
+            precision=get_torch_dtype_from_string(configs["precision"])
+        )
+        return eval_result
+
+    @modal.method()
+    def measure_program_time_modal(
+        self,
+        ref_arch_src: str,
+        num_trials: int,
+        use_torch_compile: bool,
+        torch_compile_backend: str,
+        torch_compile_options: str,
+        gpu_arch: list
+    ):
+        """Measure the execution time of a reference program on Modal"""
+        from scripts.generate_baseline_time import measure_program_time
+        from src.utils import set_gpu_arch
+
+        set_gpu_arch(gpu_arch)
+        device = torch.device("cuda:0")
+
+        return measure_program_time(
+            ref_arch_name="Reference Program",
+            ref_arch_src=ref_arch_src,
+            num_trials=num_trials,
+            use_torch_compile=use_torch_compile,
+            torch_compile_backend=torch_compile_backend,
+            torch_compile_options=torch_compile_options,
+            device=device
+        )
+
+
 @pydra.main(base=ScriptConfig)
 def main(config: ScriptConfig):
 
@@ -162,38 +278,88 @@ def main(config: ScriptConfig):
     kernel_src = read_file(config.kernel_src_path)
 
     # Start Evaluation
-    device = torch.device("cuda:0")  # default device
-    kernel_utils.set_gpu_arch(config.gpu_arch)
-
-    print("[INFO] Evaluating kernel against reference code")
-    # Evaluate kernel against reference code
-    kernel_eval_result = evaluate_single_sample_src(
-        ref_arch_src=ref_arch_src,
-        kernel_src=kernel_src,
-        configs=config.to_dict(),
-        device=device
-    )
-    kernel_exec_time = kernel_eval_result.runtime
-
-    # Measure baseline time
-    print("[INFO] Measuring reference program time")
-    # Default using PyTorch Eager here
-    ref_time_eager_result = measure_program_time(ref_arch_name="Reference Program",
-                                                 ref_arch_src=ref_arch_src,
-                                                 num_trials=config.num_perf_trials,
-                                                 use_torch_compile=False,
-                                                 device=device)
-    ref_exec_eager_time = ref_time_eager_result.get("mean", None)
-
-    # Measure Torch Compile time
-    ref_time_compile_result = measure_program_time(ref_arch_name="Reference Program",
-                                                   ref_arch_src=ref_arch_src,
-                                                   num_trials=config.num_perf_trials,
-                                                   use_torch_compile=True,
-                                                   torch_compile_backend="inductor",
-                                                   torch_compile_options="default",
-                                                   device=device)
-    ref_exec_compile_time = ref_time_compile_result.get("mean", None)
+    assert config.eval_mode in ["local", "modal"], "eval_mode must be either 'local' or 'modal'"
+
+    if config.eval_mode == "local":
+        # Local evaluation (existing code path)
+        device = torch.device("cuda:0")
+        kernel_utils.set_gpu_arch(config.gpu_arch)
+
+        print("[INFO] Evaluating kernel against reference code (LOCAL)")
+        # Evaluate kernel against reference code
+        kernel_eval_result = evaluate_single_sample_src(
+            ref_arch_src=ref_arch_src,
+            kernel_src=kernel_src,
+            configs=config.to_dict(),
+            device=device
+        )
+        kernel_exec_time = kernel_eval_result.runtime
+
+        # Measure baseline time
+        print("[INFO] Measuring reference program time")
+        # Default using PyTorch Eager here
+        ref_time_eager_result = measure_program_time(ref_arch_name="Reference Program",
+                                                     ref_arch_src=ref_arch_src,
+                                                     num_trials=config.num_perf_trials,
+                                                     use_torch_compile=False,
+                                                     device=device)
+        ref_exec_eager_time = ref_time_eager_result.get("mean", None)
+
+        # Measure Torch Compile time
+        ref_time_compile_result = measure_program_time(ref_arch_name="Reference Program",
+                                                       ref_arch_src=ref_arch_src,
+                                                       num_trials=config.num_perf_trials,
+                                                       use_torch_compile=True,
+                                                       torch_compile_backend="inductor",
+                                                       torch_compile_options="default",
+                                                       device=device)
+        ref_exec_compile_time = ref_time_compile_result.get("mean", None)
+
+    elif config.eval_mode == "modal":
+        # Modal evaluation (remote execution)
+        gpu_arch = gpu_arch_mapping.get(config.gpu, config.gpu_arch)
+        print(f"[INFO] Using GPU: {config.gpu} with architecture: {gpu_arch}")
+
+        with app.run():
+            print("[INFO] Evaluating kernel against reference code (MODAL)")
+            # Evaluate kernel against reference code
+            kernel_eval_result = EvalFunc.with_options(
+                gpu=config.gpu
+            )().evaluate_single_sample_src_modal.remote(
+                ref_arch_src=ref_arch_src,
+                kernel_src=kernel_src,
+                configs=config.to_dict(),
+                gpu_arch=gpu_arch
+            )
+            kernel_exec_time = kernel_eval_result.runtime
+
+            # Measure baseline time
+            print("[INFO] Measuring reference program time (PyTorch Eager)")
+            ref_time_eager_result = EvalFunc.with_options(
+                gpu=config.gpu
+            )().measure_program_time_modal.remote(
+                ref_arch_src=ref_arch_src,
+                num_trials=config.num_perf_trials,
+                use_torch_compile=False,
+                torch_compile_backend=None,
+                torch_compile_options=None,
+                gpu_arch=gpu_arch
+            )
+            ref_exec_eager_time = ref_time_eager_result.get("mean", None)
+
+            # Measure Torch Compile time
+            print("[INFO] Measuring reference program time (torch.compile)")
+            ref_time_compile_result = EvalFunc.with_options(
+                gpu=config.gpu
+            )().measure_program_time_modal.remote(
+                ref_arch_src=ref_arch_src,
+                num_trials=config.num_perf_trials,
+                use_torch_compile=True,
+                torch_compile_backend="inductor",
+                torch_compile_options="default",
+                gpu_arch=gpu_arch
+            )
+            ref_exec_compile_time = ref_time_compile_result.get("mean", None)
 
     print("="*40)
     print(f"[Eval] Kernel eval result: {kernel_eval_result}")

From 86cb7d9904db6bb2c69b0aa33570bd5fcd4b890a Mon Sep 17 00:00:00 2001
From: Simon Guo
Date: Sun, 9 Nov 2025 01:22:57 +0000
Subject: [PATCH 2/2] remove a few unnecessary modal dependencies, ready to
 merge

---
 README.md                               | 2 +-
 scripts/eval_from_generations.py        | 4 ----
 scripts/generate_baseline_time_modal.py | 5 +----
 scripts/run_and_check.py                | 4 ----
 4 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index b3db083a..ba897602 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ To evaluate model-generated kernels, we need to check if they:
 
 Check out `src/eval.py` for details on how we implement correctness check and timing.
 
-We provide a convenient script `scripts/run_and_check.py` to evaluate one single sample source code against a reference source code, check correctness and compute speedup. You can use this to evaluate a model-generated kernel.
+We provide a convenient script `scripts/run_and_check.py` to evaluate one single sample source code against a reference source code, check correctness and compute speedup. You can use this to evaluate a kernel either locally or remotely by setting `eval_mode=local` or `eval_mode=modal`.
 
 #### Overall Benchmark Metric
 
diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py
index c7f6fe61..accdb2db 100644
--- a/scripts/eval_from_generations.py
+++ b/scripts/eval_from_generations.py
@@ -68,17 +68,13 @@
         "clang"
     )
     .pip_install(
-        "anthropic",
         "numpy",
-        "openai",
         "packaging",
         "pydra_config",
         "torch==2.5.0",
         "tqdm",
         "datasets",
         "transformers",
-        "google-generativeai",
-        "together",
         "pytest",
         "ninja",
         "utils",
diff --git a/scripts/generate_baseline_time_modal.py b/scripts/generate_baseline_time_modal.py
index 52b44451..8caa8058 100644
--- a/scripts/generate_baseline_time_modal.py
+++ b/scripts/generate_baseline_time_modal.py
@@ -68,17 +68,14 @@
         "clang"  # note i skip a step
     )
     .pip_install(  # required to build flash-attn
-        "anthropic",
+        # Let's unify these dependencies somewhere
         "numpy",
-        "openai",
         "packaging",
         "pydra_config",
         "torch==2.5.0",
         "tqdm",
         "datasets",
         "transformers",
-        "google-generativeai",
-        "together",
         "pytest",
         "ninja",
         "utils",
diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py
index 2d0bd7a5..d60ef30f 100644
--- a/scripts/run_and_check.py
+++ b/scripts/run_and_check.py
@@ -36,17 +36,13 @@
     modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10")
     .apt_install("git", "gcc-10", "g++-10", "clang")
     .pip_install(
-        "anthropic",
         "numpy",
-        "openai",
         "packaging",
         "pydra_config",
         "torch==2.5.0",
         "tqdm",
         "datasets",
         "transformers",
-        "google-generativeai",
-        "together",
         "pytest",
         "ninja",
         "utils",
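
For readers unfamiliar with the Modal idioms these patches lean on (modal.App, @app.cls, and with_options(gpu=...) for per-call GPU selection), here is a minimal, self-contained sketch of the pattern, assuming a recent modal client. The Probe class, its device_name method, and the app name are hypothetical and exist only for illustration:

import modal

# Hypothetical demo app; mirrors the CUDA image setup in patch 1, trimmed down.
app = modal.App("modal-pattern-demo")
image = (
    modal.Image.from_registry("nvidia/cuda:12.4.0-devel-ubuntu22.04", add_python="3.10")
    .pip_install("torch==2.5.0")
)

@app.cls(image=image, scaledown_window=5)
class Probe:
    @modal.method()
    def device_name(self) -> str:
        # Runs remotely, on whichever GPU type with_options() selected.
        import torch
        return torch.cuda.get_device_name(0)

if __name__ == "__main__":
    with app.run():
        # with_options() binds the GPU type at the call site, which is how
        # run_and_check.py forwards its config.gpu setting to EvalFunc.
        print(Probe.with_options(gpu="L40S")().device_name.remote())

The same three steps (define the image, decorate the class, select the GPU at call time) are what EvalFunc.with_options(gpu=config.gpu)().evaluate_single_sample_src_modal.remote(...) performs in patch 1.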