From db46141cdcdf84b9905175754b90ca05e2995331 Mon Sep 17 00:00:00 2001 From: Kesavan Date: Mon, 10 Nov 2025 18:55:16 +0000 Subject: [PATCH 01/13] Start seperate timing file for timing functions Co-authored-by: Simon Guo Co-authored-by: Pietro Marsella --- src/timing.py | 181 +++++++++++++++++++++++++++++ src/unit_tests/test_eval_timing.py | 22 ++++ 2 files changed, 203 insertions(+) create mode 100644 src/timing.py create mode 100644 src/unit_tests/test_eval_timing.py diff --git a/src/timing.py b/src/timing.py new file mode 100644 index 00000000..e6c2c006 --- /dev/null +++ b/src/timing.py @@ -0,0 +1,181 @@ +import torch +import json +import triton +import numpy as np +import time +from typing import Any +import os, sys + +################################################################################ +# Performance Eval +################################################################################ + +############################################################# +# Timing Functions +# TODO: see our detailed study on how to time kernel execution +# we implement a few ways to do timing studies +# agnositic whether the modules are rather Model or ModelNew +############################################################# + + +def get_timing_function( + method: str = "cuda_event", # by default +) -> callable: + """ + Get the timing function based on different timing methods + """ + assert method in ["cuda_event", "do_bench", "time_time"] + print( + f"[Profiling] Using timing method: {method}" + ) + match method: + case "cuda_event": + return time_execution_with_cuda_event + case "do_bench": + return time_execution_with_do_bench + case "time_time": + return time_execution_with_tim_dot_time + case _: + raise ValueError(f"Unknown timing method: {method}") + + +# TODO: do we want to support pytorch profiler + +def time_execution_with_do_bench( + kernel_fn: callable, + *args, + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using triton.do_bench + """ + + raise NotImplementedError + + +def time_execution_with_time_dot_time( + kernel_fn: callable, + *args, + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using time.time() + """ + raise RuntimeError("This function should not be used for timing, it's here purely for reference") + + # use this + # start = time.time() + # this is not the way but we will implement it for tutorial + + + +def time_execution_with_cuda_event( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using torch.cuda.Event + + Args: + kernel_fn: Function to time + *args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + TODO: make this super solid and check this + Returns: + List of elapsed times in milliseconds + """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + print( + f"[Profiling] Using device: {device} 
{torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + elapsed_times = [] + + # Actual trials + for trial in range(num_trials): + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + kernel_fn(*args) + end_event.record() + + # Synchronize to ensure the events have completed + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + + return elapsed_times + +######################################################## +# Timing stats +######################################################### +def fetch_baseline_time( + level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str +) -> dict: + """ + Fetch the baseline time from the time + """ + if not os.path.exists(baseline_time_filepath): + raise FileNotFoundError( + f"Baseline time file not found at {baseline_time_filepath}" + ) + + with open(baseline_time_filepath, "r") as f: + baseline_json = json.load(f) + + problem_name = dataset[problem_id].split("/")[-1] + baseline_time = baseline_json[level_name].get(problem_name, None) + return baseline_time + + +def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict: + """Get timing statistics from a list of elapsed times. + + Args: + elapsed_times: List of elapsed times in milliseconds + device: CUDA device, record device info + Returns: + Dict containing mean, std, min, max and num_trials + all timing are in ms + """ + + stats = { + "mean": float(f"{np.mean(elapsed_times):.3g}"), + "std": float(f"{np.std(elapsed_times):.3g}"), + "min": float(f"{np.min(elapsed_times):.3g}"), + "max": float(f"{np.max(elapsed_times):.3g}"), + "num_trials": len(elapsed_times), + } + + if device: + stats["hardware"] = torch.cuda.get_device_name(device=device) + stats["device"] = str(device) # for debugging + + return stats diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py new file mode 100644 index 00000000..ef6fffc6 --- /dev/null +++ b/src/unit_tests/test_eval_timing.py @@ -0,0 +1,22 @@ +import os + + +""" +Test Timing + +We want to systematically study different timing methodologies. 
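+The candidates implemented in src/timing.py are CUDA events, triton's do_bench, and host-side time.time() measurements,
+selected by name via get_timing_function("cuda_event" | "do_bench" | "time_time").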
+ +""" +REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + +# use exampls in the few shot directory +EXAMPLES_PATH = os.path.join(REPO_PATH, "src", "prompts", "few_shot") + +# Configure your test cases here +TEST_REF_FILE = "model_ex_tiled_matmul.py" +TEST_KERNEL_FILE = "model_new_ex_tiled_matmul.py" + +assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_REF_FILE)), f"Reference file {TEST_REF_FILE} does not exist in {EXAMPLES_PATH}" +assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" + + From 5ff8891b3af1044c89208b5971418b70e753aad5 Mon Sep 17 00:00:00 2001 From: Sahan Date: Sun, 16 Nov 2025 23:38:39 +0000 Subject: [PATCH 02/13] Add tests, cache clearning, time, and do_bench --- src/do_bench.py | 126 +++++++++++++++++++++++++++++ src/timing.py | 75 +++++++++++++---- src/unit_tests/test_eval_timing.py | 73 +++++++++++++++++ 3 files changed, 259 insertions(+), 15 deletions(-) create mode 100644 src/do_bench.py diff --git a/src/do_bench.py b/src/do_bench.py new file mode 100644 index 00000000..a6e8d8f0 --- /dev/null +++ b/src/do_bench.py @@ -0,0 +1,126 @@ +import math +import statistics +from triton import runtime + + +# pure Python implementation of np.quantile/torch.quantile +# to avoid unnecessary runtime dependency on numpy/torch + +# This is a slightly modfied version of triton.testing.do_bench (triton v3.5.x) from +# https://github.com/triton-lang/triton/blob/0add68262ab0a2e33b84524346cb27cbb2787356/python/triton/testing.py#L127 +# with minor a minor modification to support having warmup and repeat time instead be specified in number of iterations +# instead of ms. All changes are explcitly marked + +def _quantile(a, q): + n = len(a) + a = sorted(a) + + def get_quantile(q): + if not (0 <= q <= 1): + raise ValueError("Quantiles must be in the range [0, 1]") + point = q * (n - 1) + lower = math.floor(point) + upper = math.ceil(point) + t = point - lower + return (1 - t) * a[lower] + t * a[upper] + + return [get_quantile(q) for q in q] + + +def _summarize_statistics(times, quantiles, return_mode): + if quantiles is not None: + ret = _quantile(times, quantiles) + if len(ret) == 1: + ret = ret[0] + return ret + if return_mode == "all": + return times + elif return_mode == "min": + return min(times) + elif return_mode == "max": + return max(times) + elif return_mode == "mean": + return statistics.mean(times) + elif return_mode == "median": + return statistics.median(times) + + +def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"): + """ + Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with + the 20-th and 80-th performance percentile. CHANGE: warmup and repeat time are specified in number of iterations rather than ms + + + :param fn: Function to benchmark + :type fn: Callable + :param warmup: Warmup time (in number of iterations) + :type warmup: int + :param rep: Repetition time (in number of iterations) + :type rep: int + :param grad_to_none: Reset the gradient of the provided tensor to None + :type grad_to_none: torch.tensor, optional + :param quantiles: Performance percentile to return in addition to the median. + :type quantiles: list[float], optional + :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean". 
+ :type return_mode: str + """ + assert return_mode in ["min", "max", "mean", "median", "all"] + + # Change + # mean, max, min, quantiles, etc. make no sense with 0 reps + if not (return_mode == "all" and quantiles is None) and rep < 1: + error_msg = ( + f"You are running with {rep} reps. This is likely a mistake!!!\n" + "We do let you do this, but ONLY when quantiles is None when return_mode is not 'all'\n" + "to be consistent with the rest of KernelBench's timing functions" + ) + raise ValueError(error_msg) + # End of change + di = runtime.driver.active.get_device_interface() + + fn() + di.synchronize() + + cache = runtime.driver.active.get_empty_cache_for_benchmark() + + # Estimate the runtime of the function + start_event = di.Event(enable_timing=True) + end_event = di.Event(enable_timing=True) + start_event.record() + for _ in range(5): + runtime.driver.active.clear_cache(cache) + fn() + end_event.record() + di.synchronize() + estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + n_warmup = warmup + n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(n_repeat)] + end_event = [di.Event(enable_timing=True) for i in range(n_repeat)] + # Warm-up + for _ in range(n_warmup): + fn() + # Benchmark + for i in range(n_repeat): + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. So we clear the + # provided gradients + if grad_to_none is not None: + for x in grad_to_none: + x.grad = None + # we clear the L2 cache before each run + runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + fn() + end_event[i].record() + # Record clocks + di.synchronize() + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + return _summarize_statistics(times, quantiles, return_mode) \ No newline at end of file diff --git a/src/timing.py b/src/timing.py index e6c2c006..3dc49b0d 100644 --- a/src/timing.py +++ b/src/timing.py @@ -1,10 +1,11 @@ import torch import json -import triton import numpy as np import time +import warnings from typing import Any -import os, sys +import os +from do_bench import do_bench ################################################################################ # Performance Eval @@ -17,7 +18,6 @@ # agnositic whether the modules are rather Model or ModelNew ############################################################# - def get_timing_function( method: str = "cuda_event", # by default ) -> callable: @@ -37,13 +37,10 @@ def get_timing_function( return time_execution_with_tim_dot_time case _: raise ValueError(f"Unknown timing method: {method}") - - -# TODO: do we want to support pytorch profiler def time_execution_with_do_bench( kernel_fn: callable, - *args, + args: list[Any], num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, @@ -52,13 +49,17 @@ def time_execution_with_do_bench( """ Time a CUDA kernel function over multiple trials using triton.do_bench """ - - raise NotImplementedError + return do_bench( + lambda: kernel_fn(*args), + warmup=num_warmup, + rep=num_trials, + return_mode="all", + ) def time_execution_with_time_dot_time( kernel_fn: callable, - *args, + args: list[Any], num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, @@ -66,12 +67,54 @@ def time_execution_with_time_dot_time( ) -> list[float]: """ Time a CUDA kernel function over multiple trials using time.time() + + Args: 
+ kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + Returns: + List of elapsed times in milliseconds """ - raise RuntimeError("This function should not be used for timing, it's here purely for reference") - - # use this - # start = time.time() - # this is not the way but we will implement it for tutorial + + # give warning that this is not the way to do it + warnings.warn( + "time_execution_with_time_dot_time is meant for educational purposes only, please other options like time_with_cuda_event or time_with_do_bench", + UserWarning, + ) + + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + print( + f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + elapsed_times = [] + + # Actual trials + for trial in range(num_trials): + start_time = time.time() + kernel_fn(*args) + torch.cuda.synchronize(device=device) + end_time = time.time() + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = (end_time - start_time) * 1000 + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + + return elapsed_times + @@ -106,6 +149,7 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) + torch.cuda.clear_cache() print( f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" @@ -124,6 +168,7 @@ def time_execution_with_cuda_event( # Synchronize to ensure the events have completed torch.cuda.synchronize(device=device) + torch.cuda.clear_cache() # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index ef6fffc6..23a100db 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -1,4 +1,14 @@ import os +import sys +import torch +import pytest + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from timing import ( + time_execution_with_cuda_event, + time_execution_with_time_dot_time, + time_execution_with_do_bench, +) """ @@ -20,3 +30,66 @@ assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" +def _run_timing_smoke_test(timing_fn): + """ + Scaffold function for timing smoke tests. 
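+    Runs a 512x512 matmul through the given timing function and sanity-checks the returned trial times.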
+ + Args: + timing_fn: The timing function to test + use_args: Whether the timing function expects args parameter (True for cuda_event/time_dot_time, False for do_bench) + """ + # Skip if CUDA is not available + if not torch.cuda.is_available(): + pytest.skip("CUDA not available, skipping timing tests") + + # Create test matrices + size = 512 + a = torch.randn(size, size, device='cuda') + b = torch.randn(size, size, device='cuda') + + num_warmup = 5 + num_trials = 5 + + # Define the kernel function to time + def matmul_kernel(a, b): + return torch.matmul(a, b) + + elapsed_times = timing_fn( + matmul_kernel, + args=[a, b], + num_warmup=num_warmup, + num_trials=num_trials, + verbose=False, + ) + + # Validate results + assert isinstance(elapsed_times, list), "Expected list of elapsed times" + assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" + assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" + assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + + +def test_time_execution_with_cuda_event_smoke(): + """ + Smoke test for time_execution_with_cuda_event using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + """ + _run_timing_smoke_test(time_execution_with_cuda_event) + + +def test_time_execution_with_time_dot_time_smoke(): + """ + Smoke test for time_execution_with_time_dot_time using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + """ + _run_timing_smoke_test(time_execution_with_time_dot_time) + + +def test_time_execution_with_do_bench_smoke(): + """ + Smoke test for time_execution_with_do_bench using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. 
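+    Note: the vendored do_bench in src/do_bench.py treats warmup/rep as iteration counts, so num_warmup and num_trials map onto them directly.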
+ """ + _run_timing_smoke_test(time_execution_with_do_bench) + + From 9581487ea29a9f742cfaf1040bd4deb1626e9026 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 12 Dec 2025 03:29:51 +0000 Subject: [PATCH 03/13] reorganize timing func, migrate cuda event with l2 cache from branch to here; a few other to implement --- requirements.txt | 4 +- src/timing.py | 140 +++++++++++++++++++++-------- src/unit_tests/test_eval_timing.py | 56 ++++++------ 3 files changed, 131 insertions(+), 69 deletions(-) diff --git a/requirements.txt b/requirements.txt index d7f31a49..cc7300f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Frameworks -torch==2.5.0 +torch==2.9.0 # we shall upgrade torch for blackwell when it is stable transformers datasets @@ -9,6 +9,7 @@ modal nvidia-cutlass-dsl tilelang apache-tvm +triton # helper tqdm @@ -16,6 +17,7 @@ packaging pydra_config pytest ninja +cupy-cuda12x # Numerics einops diff --git a/src/timing.py b/src/timing.py index 3dc49b0d..007d75c6 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,26 +5,25 @@ import warnings from typing import Any import os -from do_bench import do_bench +from triton import runtime +from eval import clear_l2_cache ################################################################################ # Performance Eval ################################################################################ -############################################################# -# Timing Functions -# TODO: see our detailed study on how to time kernel execution -# we implement a few ways to do timing studies -# agnositic whether the modules are rather Model or ModelNew -############################################################# - +""" +Kernel Timing Functions [Revamp WIP] +TODO: see our detailed study on how to time kernel execution and benchmarking guide +we implement a few ways to do timing studies +These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +""" def get_timing_function( method: str = "cuda_event", # by default ) -> callable: """ Get the timing function based on different timing methods """ - assert method in ["cuda_event", "do_bench", "time_time"] print( f"[Profiling] Using timing method: {method}" ) @@ -34,9 +33,9 @@ def get_timing_function( case "do_bench": return time_execution_with_do_bench case "time_time": - return time_execution_with_tim_dot_time + return time_execution_with_time_dot_time # this is just for education purpose, don't use this case _: - raise ValueError(f"Unknown timing method: {method}") + raise ValueError(f"Unsupported timing method: {method}") def time_execution_with_do_bench( kernel_fn: callable, @@ -44,18 +43,71 @@ def time_execution_with_do_bench( num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, - device: torch.device = None, -) -> list[float]: + device: torch.device | None = None) -> list[float]: """ - Time a CUDA kernel function over multiple trials using triton.do_bench + TODO: need check do_bench + [WIP] need to check """ - return do_bench( - lambda: kernel_fn(*args), - warmup=num_warmup, - rep=num_trials, - return_mode="all", - ) - + + device = torch.cuda.current_device() if device is not None else device + + if verbose: print("Using do_bench to evaluate kernel") + + # note: for both nvidia and amd, di is torch.cuda (amd uses a cuda compatible interface), so we could really just have torch.cuda + di = runtime.driver.active.get_device_interface() + + kernel_fn(*args) + di.synchronize(device=device) + + 
cache = runtime.driver.active.get_empty_cache_for_benchmark() + + # Estimate the runtime of the function (not needed since now the warmup and repeat steps are set by the user) + + # start_event = di.Event(enable_timing=True) + # end_event = di.Event(enable_timing=True) + # start_event.record() + # for _ in range(5): + # runtime.driver.active.clear_cache(cache) + # kernel_fn(*args) + # end_event.record() + # di.synchronize() + # estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + # n_warmup = warmup + # n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(num_trials)] + end_event = [di.Event(enable_timing=True) for i in range(num_trials)] + # Warm-up + for _ in range(num_warmup): + kernel_fn(*args) + # Benchmark + for i in range(num_trials): + + # All our functions are forward passes, so we don't need to reset gradients + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. So we clear the + # provided gradients + # if grad_to_none is not None: + # for x in grad_to_none: + # x.grad = None + + # we clear the L2 cache before each run + runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + kernel_fn(*args) + end_event[i].record() + # Record clocks + di.synchronize(device=device) + if verbose: print('Done with do_bench evaluation') + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + return times + def time_execution_with_time_dot_time( kernel_fn: callable, @@ -63,11 +115,11 @@ def time_execution_with_time_dot_time( num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, - device: torch.device = None, + device: torch.device | None = None, ) -> list[float]: """ Time a CUDA kernel function over multiple trials using time.time() - + [WIP] Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -77,6 +129,8 @@ def time_execution_with_time_dot_time( Returns: List of elapsed times in milliseconds + + Not recommended: """ # give warning that this is not the way to do it @@ -95,9 +149,7 @@ def time_execution_with_time_dot_time( kernel_fn(*args) torch.cuda.synchronize(device=device) - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") elapsed_times = [] # Actual trials @@ -128,6 +180,8 @@ def time_execution_with_cuda_event( ) -> list[float]: """ Time a CUDA kernel function over multiple trials using torch.cuda.Event + The first version of KenrelBench used this for evaluation. + We care about cold cache performance here. 
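+    The L2 cache is cleared before each timed trial, so every measurement reflects a cold-cache run.
+
+    Example (a sketch; assumes tensors a and b already live on the GPU):
+        elapsed = time_execution_with_cuda_event(torch.matmul, [a, b], num_warmup=3, num_trials=10)
+        stats = get_timing_stats(elapsed)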
Args: kernel_fn: Function to time @@ -136,7 +190,7 @@ def time_execution_with_cuda_event( verbose: Whether to print per-trial timing info device: CUDA device to use, if None, use current device - TODO: make this super solid and check this + TODO: double check this with team Returns: List of elapsed times in milliseconds """ @@ -149,35 +203,44 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) - torch.cuda.clear_cache() - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() + + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" ) - elapsed_times = [] - # Actual trials + elapsed_times: list[float] = [] # in ms + + # Timing trials for trial in range(num_trials): + torch.cuda.synchronize(device=device) # block on all streams + # create event marker default is not interprocess start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - + + clear_l2_cache() # measuring cold cache performance + + # note cuda events mark event on current stream start_event.record() - kernel_fn(*args) - end_event.record() + _ = kernel_fn(*args) + end_event.record() - # Synchronize to ensure the events have completed + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream torch.cuda.synchronize(device=device) - torch.cuda.clear_cache() # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + print(f"Timing Trial {trial + 1}: {elapsed_time_ms:.3g} ms") elapsed_times.append(elapsed_time_ms) return elapsed_times + ######################################################## # Timing stats ######################################################### @@ -195,6 +258,7 @@ def fetch_baseline_time( with open(baseline_time_filepath, "r") as f: baseline_json = json.load(f) + # TODO: replace with the new Dataset object that Omar will merge in problem_name = dataset[problem_id].split("/")[-1] baseline_time = baseline_json[level_name].get(problem_name, None) return baseline_time diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 23a100db..955b2519 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -4,18 +4,11 @@ import pytest sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from timing import ( - time_execution_with_cuda_event, - time_execution_with_time_dot_time, - time_execution_with_do_bench, -) - +import timing """ Test Timing - We want to systematically study different timing methodologies. - """ REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) @@ -30,10 +23,11 @@ assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" -def _run_timing_smoke_test(timing_fn): +def _run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): """ Scaffold function for timing smoke tests. - + Smoke test for using 512x512 matmul. 
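+    The timing function is looked up by name via timing.get_timing_function(timing_func_name).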
+ Args: timing_fn: The timing function to test use_args: Whether the timing function expects args parameter (True for cuda_event/time_dot_time, False for do_bench) @@ -42,10 +36,10 @@ def _run_timing_smoke_test(timing_fn): if not torch.cuda.is_available(): pytest.skip("CUDA not available, skipping timing tests") - # Create test matrices + # Create simple test matrices size = 512 - a = torch.randn(size, size, device='cuda') - b = torch.randn(size, size, device='cuda') + a = torch.randn(size, size, device=device) + b = torch.randn(size, size, device=device) num_warmup = 5 num_trials = 5 @@ -54,7 +48,8 @@ def _run_timing_smoke_test(timing_fn): def matmul_kernel(a, b): return torch.matmul(a, b) - elapsed_times = timing_fn( + timing_func = timing.get_timing_function(timing_func_name) + elapsed_times = timing_func( matmul_kernel, args=[a, b], num_warmup=num_warmup, @@ -67,29 +62,30 @@ def matmul_kernel(a, b): assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") -def test_time_execution_with_cuda_event_smoke(): - """ - Smoke test for time_execution_with_cuda_event using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. - """ - _run_timing_smoke_test(time_execution_with_cuda_event) +_run_timing_smoke_test_matmul("cuda_event") -def test_time_execution_with_time_dot_time_smoke(): +def test_do_bench_simple_smoke(): """ - Smoke test for time_execution_with_time_dot_time using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + Smoke test for do_bench itself on a simple CUDA operation. + Just checks it runs and returns timings. """ - _run_timing_smoke_test(time_execution_with_time_dot_time) + if not torch.cuda.is_available(): + pytest.skip("CUDA not available, skipping do_bench smoke test") + from do_bench import do_bench -def test_time_execution_with_do_bench_smoke(): - """ - Smoke test for time_execution_with_do_bench using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. 
- """ - _run_timing_smoke_test(time_execution_with_do_bench) + x = torch.randn(1024, device="cuda") + + def fn(): + # simple GPU op; do_bench will sync/timestamp internally + return (x * 2).sum() + rep = 5 + times = do_bench(fn, warmup=2, rep=rep, return_mode="all") + assert isinstance(times, list) + assert len(times) == rep From 467f856d6af44629fcc07ccfcdf36365b4cb95d7 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 12 Dec 2025 04:25:48 +0000 Subject: [PATCH 04/13] implement do_bench and cpu host timing, script to run all 4 timing methods --- src/do_bench.py | 126 ------------ src/timing.py | 296 ++++++++++++++++++----------- src/unit_tests/test_eval_timing.py | 14 +- 3 files changed, 193 insertions(+), 243 deletions(-) delete mode 100644 src/do_bench.py diff --git a/src/do_bench.py b/src/do_bench.py deleted file mode 100644 index a6e8d8f0..00000000 --- a/src/do_bench.py +++ /dev/null @@ -1,126 +0,0 @@ -import math -import statistics -from triton import runtime - - -# pure Python implementation of np.quantile/torch.quantile -# to avoid unnecessary runtime dependency on numpy/torch - -# This is a slightly modfied version of triton.testing.do_bench (triton v3.5.x) from -# https://github.com/triton-lang/triton/blob/0add68262ab0a2e33b84524346cb27cbb2787356/python/triton/testing.py#L127 -# with minor a minor modification to support having warmup and repeat time instead be specified in number of iterations -# instead of ms. All changes are explcitly marked - -def _quantile(a, q): - n = len(a) - a = sorted(a) - - def get_quantile(q): - if not (0 <= q <= 1): - raise ValueError("Quantiles must be in the range [0, 1]") - point = q * (n - 1) - lower = math.floor(point) - upper = math.ceil(point) - t = point - lower - return (1 - t) * a[lower] + t * a[upper] - - return [get_quantile(q) for q in q] - - -def _summarize_statistics(times, quantiles, return_mode): - if quantiles is not None: - ret = _quantile(times, quantiles) - if len(ret) == 1: - ret = ret[0] - return ret - if return_mode == "all": - return times - elif return_mode == "min": - return min(times) - elif return_mode == "max": - return max(times) - elif return_mode == "mean": - return statistics.mean(times) - elif return_mode == "median": - return statistics.median(times) - - -def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"): - """ - Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with - the 20-th and 80-th performance percentile. CHANGE: warmup and repeat time are specified in number of iterations rather than ms - - - :param fn: Function to benchmark - :type fn: Callable - :param warmup: Warmup time (in number of iterations) - :type warmup: int - :param rep: Repetition time (in number of iterations) - :type rep: int - :param grad_to_none: Reset the gradient of the provided tensor to None - :type grad_to_none: torch.tensor, optional - :param quantiles: Performance percentile to return in addition to the median. - :type quantiles: list[float], optional - :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean". - :type return_mode: str - """ - assert return_mode in ["min", "max", "mean", "median", "all"] - - # Change - # mean, max, min, quantiles, etc. make no sense with 0 reps - if not (return_mode == "all" and quantiles is None) and rep < 1: - error_msg = ( - f"You are running with {rep} reps. 
This is likely a mistake!!!\n" - "We do let you do this, but ONLY when quantiles is None when return_mode is not 'all'\n" - "to be consistent with the rest of KernelBench's timing functions" - ) - raise ValueError(error_msg) - # End of change - di = runtime.driver.active.get_device_interface() - - fn() - di.synchronize() - - cache = runtime.driver.active.get_empty_cache_for_benchmark() - - # Estimate the runtime of the function - start_event = di.Event(enable_timing=True) - end_event = di.Event(enable_timing=True) - start_event.record() - for _ in range(5): - runtime.driver.active.clear_cache(cache) - fn() - end_event.record() - di.synchronize() - estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - # Change - # n_warmup = max(1, int(warmup / estimate_ms)) - # n_repeat = max(1, int(rep / estimate_ms)) - n_warmup = warmup - n_repeat = rep - # end of change - start_event = [di.Event(enable_timing=True) for i in range(n_repeat)] - end_event = [di.Event(enable_timing=True) for i in range(n_repeat)] - # Warm-up - for _ in range(n_warmup): - fn() - # Benchmark - for i in range(n_repeat): - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. So we clear the - # provided gradients - if grad_to_none is not None: - for x in grad_to_none: - x.grad = None - # we clear the L2 cache before each run - runtime.driver.active.clear_cache(cache) - # record time of `fn` - start_event[i].record() - fn() - end_event[i].record() - # Record clocks - di.synchronize() - times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] - return _summarize_statistics(times, quantiles, return_mode) \ No newline at end of file diff --git a/src/timing.py b/src/timing.py index 007d75c6..c10b1c58 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,19 +5,43 @@ import warnings from typing import Any import os -from triton import runtime -from eval import clear_l2_cache +from triton import runtime as triton_runtime +from triton import testing as triton_testing ################################################################################ # Performance Eval ################################################################################ -""" -Kernel Timing Functions [Revamp WIP] -TODO: see our detailed study on how to time kernel execution and benchmarking guide -we implement a few ways to do timing studies -These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) -""" +def clear_l2_cache(device: str = "cuda"): + """ + Clear L2 Cache line by thrashing + From GPU mode reference kernel repo: + https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 + + We can improve this + TODO; should prob check device_name + """ + # don't reserve space for persisting lines + # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + + # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache + # 32 * 1024 * 1024 * 8B = 256MB + # NOTE: we can make this more adaptive based on device + # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) + # write to tenosr with inplace fill + dummy.fill_(42) + del dummy + +def clear_l2_cache_triton(cache=None, device: str = "cuda"): + 
""" + Thrash the cache by making a large dummy tensor, using triton runtime's functionality + """ + with torch.cuda.device(device): + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + triton_runtime.driver.active.clear_cache(cache) + + def get_timing_function( method: str = "cuda_event", # by default ) -> callable: @@ -30,44 +54,155 @@ def get_timing_function( match method: case "cuda_event": return time_execution_with_cuda_event - case "do_bench": - return time_execution_with_do_bench - case "time_time": - return time_execution_with_time_dot_time # this is just for education purpose, don't use this + case "do_bench_interface": + return time_execution_with_do_bench_interface + case "do_bench_impl": + return time_execution_with_do_bench_impl + case "cpu_time": + return time_execution_with_cpu_time + # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") -def time_execution_with_do_bench( +""" +Kernel Timing Functions [Revamp WIP] +TODO: see our detailed study on how to time kernel execution and benchmarking guide +we implement a few ways to do timing studies +These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +""" + + +def time_execution_with_cuda_event( kernel_fn: callable, args: list[Any], num_warmup: int = 3, num_trials: int = 10, + discard_first: int = 1, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using torch.cuda.Event + The first version of KenrelBench used this for evaluation. + We care about cold cache performance here. + + Args: + kernel_fn: Function to time + *args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + TODO: double check this with team + Returns: + List of elapsed times in milliseconds + """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() + + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + + elapsed_times: list[float] = [] # in ms + + # Timing trials + for trial in range(num_trials + discard_first): + torch.cuda.synchronize(device=device) # block on all streams + + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + clear_l2_cache() # measuring cold cache performance + + # note cuda events mark event on current stream + start_event.record() + _ = kernel_fn(*args) + end_event.record() + + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + 
elapsed_times.append(elapsed_time_ms) + + return elapsed_times + + +def time_execution_with_do_bench_interface( + kernel_fn: callable, + args: list[Any], + # this is different for triton do_bench + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used yet verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - TODO: need check do_bench - [WIP] need to check + Just using triton's do_bench as it is """ - device = torch.cuda.current_device() if device is not None else device + do_bench_fn = lambda : kernel_fn(*args) + return triton_testing.do_bench(fn=do_bench_fn, + warmup=25, + rep=100, + grad_to_none=None, + quantiles=None, + return_mode="all") + - if verbose: print("Using do_bench to evaluate kernel") +def time_execution_with_do_bench_impl( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used yet + verbose: bool = True, + device: torch.device | None = None) -> list[float]: + """ + This is modifying the triton do_bench codebase + See Triton's implementation for more details + https://github.com/triton-lang/triton/blob/9073370d5979218d1afa44ec895bbd80e7419a8c/python/triton/testing.py#L127 + """ + + device = torch.cuda.current_device() if device is not None else device + if verbose: + print(f"Using do_bench to evaluate kernel on {device}") - # note: for both nvidia and amd, di is torch.cuda (amd uses a cuda compatible interface), so we could really just have torch.cuda - di = runtime.driver.active.get_device_interface() + # speicfy device interface (supports both nvidia and amd) + # under the hood, di is torch.cuda (amd uses a cuda compatible interface) + di = triton_runtime.driver.active.get_device_interface() kernel_fn(*args) di.synchronize(device=device) - cache = runtime.driver.active.get_empty_cache_for_benchmark() - - # Estimate the runtime of the function (not needed since now the warmup and repeat steps are set by the user) + # clear l2 cache + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + # do_bench Estimate the runtime of the function + # Here we are not using it not needed since now the warmup and repeat steps are set by the user) # start_event = di.Event(enable_timing=True) # end_event = di.Event(enable_timing=True) # start_event.record() # for _ in range(5): - # runtime.driver.active.clear_cache(cache) + # triton_runtime.driver.active.clear_cache(cache) # kernel_fn(*args) # end_event.record() # di.synchronize() @@ -87,8 +222,7 @@ def time_execution_with_do_bench( kernel_fn(*args) # Benchmark for i in range(num_trials): - - # All our functions are forward passes, so we don't need to reset gradients + # All KernelBench functions are forward passes, so we don't need to reset gradients # we don't want `fn` to accumulate gradient values # if it contains a backward pass. 
So we clear the # provided gradients @@ -97,7 +231,7 @@ def time_execution_with_do_bench( # x.grad = None # we clear the L2 cache before each run - runtime.driver.active.clear_cache(cache) + triton_runtime.driver.active.clear_cache(cache) # record time of `fn` start_event[i].record() kernel_fn(*args) @@ -109,16 +243,17 @@ def time_execution_with_do_bench( return times -def time_execution_with_time_dot_time( +def time_execution_with_cpu_time( kernel_fn: callable, args: list[Any], num_warmup: int = 3, num_trials: int = 10, + discard_first: int = 1, verbose: bool = True, device: torch.device | None = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using time.time() + Time a CUDA kernel function over multiple trials using CPU side timing [WIP] Args: kernel_fn: Function to time @@ -132,13 +267,6 @@ def time_execution_with_time_dot_time( Not recommended: """ - - # give warning that this is not the way to do it - warnings.warn( - "time_execution_with_time_dot_time is meant for educational purposes only, please other options like time_with_cuda_event or time_with_do_bench", - UserWarning, - ) - if device is None: if verbose: print(f"Using current device: {torch.cuda.current_device()}") @@ -152,95 +280,35 @@ def time_execution_with_time_dot_time( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") elapsed_times = [] + # clear PyTorch allocator cache + torch.cuda.empty_cache() + # Actual trials - for trial in range(num_trials): - start_time = time.time() - kernel_fn(*args) + for trial in range(num_trials + discard_first): + # block all streams on device torch.cuda.synchronize(device=device) - end_time = time.time() - # Calculate the elapsed time in milliseconds - elapsed_time_ms = (end_time - start_time) * 1000 - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) - - return elapsed_times + # focus on cold_cache performance + clear_l2_cache() - - - -def time_execution_with_cuda_event( - kernel_fn: callable, - args: list[Any], - num_warmup: int = 3, - num_trials: int = 10, - verbose: bool = True, - device: torch.device = None, -) -> list[float]: - """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - The first version of KenrelBench used this for evaluation. - We care about cold cache performance here. 
- - Args: - kernel_fn: Function to time - *args: Arguments to pass to kernel_fn - num_trials: Number of timing trials to run - verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device - - TODO: double check this with team - Returns: - List of elapsed times in milliseconds - """ - if device is None: - if verbose: - print(f"Using current device: {torch.cuda.current_device()}") - device = torch.cuda.current_device() - - # Warm ups - for _ in range(num_warmup): + # CPU-side wall clock time using perf_counter (high-resolution timer) + start_time = time.perf_counter() kernel_fn(*args) - torch.cuda.synchronize(device=device) - - # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache - torch.cuda.empty_cache() - - print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - - elapsed_times: list[float] = [] # in ms - - # Timing trials - for trial in range(num_trials): - torch.cuda.synchronize(device=device) # block on all streams - - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - clear_l2_cache() # measuring cold cache performance - - # note cuda events mark event on current stream - start_event.record() - _ = kernel_fn(*args) - end_event.record() - - # waits for all streams on that device - # though it is important to note the events only record time between on current stream - # TODO: find ways to check hacks by launching work on additional stream - torch.cuda.synchronize(device=device) + torch.cuda.synchronize(device=device) # wait for all stream to finish + # this blocks the CPU until all GPU work on device is done + # this means all kernels on all streams + end_time = time.perf_counter() # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Timing Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + elapsed_time_ms = (end_time - start_time) * 1000 + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times - ######################################################## # Timing stats ######################################################### diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 955b2519..24d0faf2 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -59,13 +59,21 @@ def matmul_kernel(a, b): # Validate results assert isinstance(elapsed_times, list), "Expected list of elapsed times" - assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" + # assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" - print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") + # print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") + stats = timing.get_timing_stats(elapsed_times, device=device) + print("Timing stats") + print(stats) + + +timing_methods = ["cuda_event", 
"cpu_time", "do_bench_interface", "do_bench_impl"] + +for timing_method in timing_methods: + _run_timing_smoke_test_matmul(timing_method) -_run_timing_smoke_test_matmul("cuda_event") def test_do_bench_simple_smoke(): From 920a7932939eb3e189572d85cfdea08a2ea4aee4 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 04:44:47 +0000 Subject: [PATCH 05/13] some annotations --- .gitignore | Bin 171 -> 152 bytes src/eval.py | 261 ++++++++++++++++++++------------------------------ src/timing.py | 36 ++++--- 3 files changed, 126 insertions(+), 171 deletions(-) diff --git a/.gitignore b/.gitignore index abb125d6aabe372688772976daa7ee024353eaf8..3956bf85591170be4f1159744411e5afa85d36a3 100644 GIT binary patch delta 22 dcmZ3@ID>IQCx3iWX=YAJd~#xPMyft97XV!Z2X+7e delta 41 vcmbQixSDZ7r&1zADMJcFCPN8BJVOpcDnl6%=P(#E=rI&9R5I`~a4`S?*LnzW diff --git a/src/eval.py b/src/eval.py index 4a072c89..2573da9c 100644 --- a/src/eval.py +++ b/src/eval.py @@ -15,13 +15,14 @@ from io import StringIO from typing import Union, Optional -import numpy as np -import requests import torch import torch.nn as nn from pydantic import BaseModel +from triton import runtime -from . import utils + +# import cupy as cp +import utils, timing REPO_TOP_PATH = os.path.abspath( os.path.join( @@ -46,6 +47,7 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if isinstance(problem_id, str): problem_id = int(problem_id) + # TODO: replace dataset object @Omar problem_path = problems[problem_id] # problem_path = os.path.join(REPO_ROOT_PATH, problem) @@ -58,7 +60,6 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str else: return (problem_path, ref_arch) - def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False): PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR) @@ -70,6 +71,36 @@ def set_seed(seed: int): # NOTE: this only sets on current cuda device torch.cuda.manual_seed(seed) + +def clear_l2_cache(device: str = "cuda"): + """ + Clear L2 Cache line by thrashing + From GPU mode reference kernel repo: + https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 + + We can improve this + TODO; should prob check device_name + """ + # don't reserve space for persisting lines + # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + + # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache + # 32 * 1024 * 1024 * 8B = 256MB + # NOTE: we can make this more adaptive based on device + # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) + # write to tenosr with inplace fill + dummy.fill_(42) + del dummy + +def clear_l2_cache_triton(cache, device): + # cp.cuda.runtime.cudaDeviceSetLimit( + # cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0 + # ) + cache = runtime.driver.active.get_empty_cache_for_benchmark() + runtime.driver.active.clear_cache(cache) + + def get_torch_dtype_from_string(precision: str) -> torch.dtype: """ Get the torch dtype for specific precision @@ -107,9 +138,8 @@ def get_tolerance_for_precision(precision: str | torch.dtype) -> float: class KernelExecResult(BaseModel): """ - Single Kernel Execution + Single Kernel Execution - all the 
information it needs """ - compiled: bool = False correctness: bool = False metadata: dict = {} @@ -387,6 +417,57 @@ def _process_input_tensor(input, device, backend="cuda", precision=torch.float32 return input_tensor.to(device=device) +def load_kernel( + verbose: str, + backend: str, + custom_model_src, + context, + build_dir, + device, + metadata: dict, + ) -> tuple[Union[nn.Module, KernelExecResult, None], Optional[tempfile.NamedTemporaryFile]]: + '''KernelExecResult means that loading the kernel failed (either because of compilation or something else), ModelNew that we succesfully loaded ModelNew''' + if verbose: + print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") + + try: + os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion + tempfile = None + + if backend.lower() in ["triton", "tilelang", "cute"]: + # Use tempfile approach for triton, tilelang, and cute + # These DSLs require proper module import for JIT decorators to work + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) + else: + # Default CUDA backend + ModelNew = load_custom_model(custom_model_src, context, build_dir) + torch.cuda.synchronize(device=device) # not sure if this is too much + except Exception as e: + print( + f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" + ) + # TODO: add metadata for compilation error (how to we get the compilation error message?) + + if "lock" in str(e) or "No such file or directory" in str(e): + # this is a lock file error, likely due to concurrent compilation + # this does not necessarily mean the compilation failed, but we should retry + print( + f"[Eval] Lock file error during compilation, Please retry. Error: {e}" + ) + graceful_eval_cleanup(context, device, tempfile) + return None, None + else: + metadata["compilation_error_name"] = get_error_name(e) + metadata["compilation_error"] = str(e) + graceful_eval_cleanup(context, device, tempfile) + return KernelExecResult( + compiled=False, metadata=metadata + ), None + return ModelNew, tempfile + + def eval_kernel_against_ref( original_model_src: str, custom_model_src: str, @@ -470,50 +551,15 @@ def eval_kernel_against_ref( assert hasattr(original_model, "forward") if verbose: print("[Eval] Original Model Loaded") - - if verbose: - print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") - - # this is where compilation happens - try: - os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion - tempfile = None - # add hash for later to distinguish between multi-turn kernels - - backend_lower = backend.lower() - if backend_lower in ["triton", "tilelang", "cute"]: - # Use tempfile approach for triton, tilelang, and cute - # These DSLs require proper module import for JIT decorators to work - ModelNew, tempfile = load_custom_model_with_tempfile( - custom_model_src, entry_point="ModelNew" - ) - else: - # Default CUDA backend - ModelNew = load_custom_model(custom_model_src, context, build_dir) - torch.cuda.synchronize(device=device) # not sure if this is too much - except Exception as e: - print( - f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" - ) - # TODO: add metadata for compilation error (how to we get the compilation error message?) 
- - if "lock" in str(e) or "No such file or directory" in str(e): - # this is a lock file error, likely due to concurrent compilation - # this does not necessarily mean the compilation failed, but we should retry - print( - f"[Eval] Lock file error during compilation, Please retry. Error: {e}" - ) - graceful_eval_cleanup(context, device, tempfile) - return None - else: - metadata["compilation_error_name"] = get_error_name(e) - metadata["compilation_error"] = e - graceful_eval_cleanup(context, device, tempfile) - return KernelExecResult( - compiled=False, metadata=metadata - ) # skip further steps - - # at this point we passed compilation + result, tempfile = load_kernel( + verbose, backend, custom_model_src, context, build_dir, device, metadata + ) + if isinstance(result, KernelExecResult): + return result # loading the kernel failed, return the exec result + if result is None: + # lockfile / concurrent compilation: retryable failure + return None + ModelNew = result # we passed loading try: with torch.no_grad(): set_seed(seed_num) # set seed for reproducible weights @@ -536,7 +582,7 @@ def eval_kernel_against_ref( compiled=True, correctness=False, metadata=metadata ) # skip further steps - kernel_exec_result = None + # kernel_exec_result = None # Check Correctness if verbose: @@ -578,14 +624,16 @@ def eval_kernel_against_ref( model_new = custom_model.to(device=device, dtype=precision) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( + # TODO: replace functions from timing based on we choose + # we should pass in which timing method you wanna do + elapsed_times = timing.time_execution_with_cuda_event( model_new, - *inputs, + inputs, num_trials=num_perf_trials, verbose=verbose, device=device, ) - runtime_stats = get_timing_stats(elapsed_times, device=device) + runtime_stats = timing.get_timing_stats(elapsed_times, device=device) if verbose: print(f"[Eval] Performance Stats: {runtime_stats}") @@ -625,63 +673,6 @@ def register_and_format_exception( return metadata -def time_execution_with_cuda_event( - kernel_fn: callable, - *args, - num_warmup: int = 3, - num_trials: int = 10, - verbose: bool = True, - device: torch.device = None, -) -> list[float]: - """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - - Args: - kernel_fn: Function to time - *args: Arguments to pass to kernel_fn - num_trials: Number of timing trials to run - verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device - - Returns: - List of elapsed times in milliseconds - """ - if device is None: - if verbose: - print(f"Using current device: {torch.cuda.current_device()}") - device = torch.cuda.current_device() - - # Warm ups - for _ in range(num_warmup): - kernel_fn(*args) - torch.cuda.synchronize(device=device) - - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - elapsed_times = [] - - # Actual trials - for trial in range(num_trials): - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - kernel_fn(*args) - end_event.record() - - # Synchronize to ensure the events have completed - torch.cuda.synchronize(device=device) - - # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - 
elapsed_times.append(elapsed_time_ms) - - return elapsed_times - def run_and_check_correctness( original_model_instance: nn.Module, @@ -865,54 +856,6 @@ def convert_to_serializable(obj): return converted_metadata -################################################################################ -# Performance Eval -################################################################################ - - -def fetch_baseline_time( - level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str -) -> dict: - """ - Fetch the baseline time from the time - """ - if not os.path.exists(baseline_time_filepath): - raise FileNotFoundError( - f"Baseline time file not found at {baseline_time_filepath}" - ) - - with open(baseline_time_filepath, "r") as f: - baseline_json = json.load(f) - - problem_name = dataset[problem_id].split("/")[-1] - baseline_time = baseline_json[level_name].get(problem_name, None) - return baseline_time - - -def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict: - """Get timing statistics from a list of elapsed times. - - Args: - elapsed_times: List of elapsed times in milliseconds - device: CUDA device, record device info - Returns: - Dict containing mean, std, min, max and num_trials - all timing are in ms - """ - - stats = { - "mean": float(f"{np.mean(elapsed_times):.3g}"), - "std": float(f"{np.std(elapsed_times):.3g}"), - "min": float(f"{np.min(elapsed_times):.3g}"), - "max": float(f"{np.max(elapsed_times):.3g}"), - "num_trials": len(elapsed_times), - } - - if device: - stats["hardware"] = torch.cuda.get_device_name(device=device) - stats["device"] = str(device) # for debugging - - return stats # if __name__ == "__main__": diff --git a/src/timing.py b/src/timing.py index c10b1c58..56cefd09 100644 --- a/src/timing.py +++ b/src/timing.py @@ -2,9 +2,10 @@ import json import numpy as np import time -import warnings from typing import Any import os + +# we leverage triton's testing functionality for some timing methods from triton import runtime as triton_runtime from triton import testing as triton_testing @@ -17,9 +18,6 @@ def clear_l2_cache(device: str = "cuda"): Clear L2 Cache line by thrashing From GPU mode reference kernel repo: https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 - - We can improve this - TODO; should prob check device_name """ # don't reserve space for persisting lines # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) @@ -39,6 +37,7 @@ def clear_l2_cache_triton(cache=None, device: str = "cuda"): """ with torch.cuda.device(device): cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + # this effectively thrashes L2 cache under the hood too triton_runtime.driver.active.clear_cache(cache) @@ -58,8 +57,8 @@ def get_timing_function( return time_execution_with_do_bench_interface case "do_bench_impl": return time_execution_with_do_bench_impl - case "cpu_time": - return time_execution_with_cpu_time + case "host_time": + return time_execution_with_host_time # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") @@ -156,10 +155,12 @@ def time_execution_with_do_bench_interface( verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - Just using triton's do_bench as it is + Using triton's default do_bench as it is + Note we don't set num_warmup and num_trials, and we use 
warmup 25 ms and repetition time 100 ms with Triton's default values + See doc: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html + Benchmark the runtime of the provided function. By default, return the median runtime of fn along with the 20-th and 80-th performance percentile. """ - - do_bench_fn = lambda : kernel_fn(*args) + do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments return triton_testing.do_bench(fn=do_bench_fn, warmup=25, rep=100, @@ -180,6 +181,10 @@ def time_execution_with_do_bench_impl( This is modifying the triton do_bench codebase See Triton's implementation for more details https://github.com/triton-lang/triton/blob/9073370d5979218d1afa44ec895bbd80e7419a8c/python/triton/testing.py#L127 + + Note we duplicate triton's implementation and modify / comment out parts + to use num_warmup and num_trials that explicitly follows what user define here + instead of do_bench's version that computes how many times to run warmup and profile based on total warmup and repetition time """ device = torch.cuda.current_device() if device is not None else device @@ -243,7 +248,7 @@ def time_execution_with_do_bench_impl( return times -def time_execution_with_cpu_time( +def time_execution_with_host_time( kernel_fn: callable, args: list[Any], num_warmup: int = 3, @@ -253,8 +258,12 @@ def time_execution_with_cpu_time( device: torch.device | None = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using CPU side timing - [WIP] + Time a CUDA kernel function over multiple trials using Host (CPU) side timing + + This measures host-side wall clock time, E2E latency observed by host + Note that could take including Python overhead, CUDA launch/runtime costs, synchronization, all GPU work across all streams, and host OS overhaed + Hence results might be longer than device-side (CUDA event) timings + Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -311,12 +320,15 @@ def time_execution_with_cpu_time( ######################################################## # Timing stats +# tools to help compute speedup and other time ######################################################### def fetch_baseline_time( level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str ) -> dict: """ Fetch the baseline time from the time + + Note: might be better to just run the refernece using torch eager and compile sometimes """ if not os.path.exists(baseline_time_filepath): raise FileNotFoundError( From 05d408faec732ea05d1c78875e2d1f104588bf60 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 05:02:36 +0000 Subject: [PATCH 06/13] run_and_check compatible --- scripts/generate_baseline_time.py | 14 ++++++++++---- src/eval.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 5a68ea08..0a1f608b 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -2,11 +2,13 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import ( + get_timing_function, + get_timing_stats, +) from src.dataset import construct_problem_dataset_from_problem_dir from src.utils import read_file import os @@ -81,6 +83,7 @@ def measure_program_time( torch_compile_options: str="default", device: torch.device="cuda:0", verbose: bool = False, + timing_method: str = 
"cuda_event", ) -> dict: """ Measure the time of a KernelBench reference architecture @@ -116,8 +119,11 @@ def measure_program_time( model = model.cuda(device=device) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + + # run chosen timing function + timing_fn = get_timing_function(timing_method) + elapsed_times = timing_fn( + model, inputs, num_trials=num_trials, verbose=verbose, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) diff --git a/src/eval.py b/src/eval.py index 2573da9c..e117d7dc 100644 --- a/src/eval.py +++ b/src/eval.py @@ -22,7 +22,7 @@ # import cupy as cp -import utils, timing +from . import utils, timing REPO_TOP_PATH = os.path.abspath( os.path.join( From 2be968a5a1a5165af1e007eadbae59f70326fd31 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 05:10:13 +0000 Subject: [PATCH 07/13] revert eval and add only necessary changes --- src/eval.py | 154 ++++++++++++++++++---------------------------------- 1 file changed, 53 insertions(+), 101 deletions(-) diff --git a/src/eval.py b/src/eval.py index e117d7dc..f157a6b7 100644 --- a/src/eval.py +++ b/src/eval.py @@ -15,13 +15,12 @@ from io import StringIO from typing import Union, Optional +import numpy as np +import requests import torch import torch.nn as nn from pydantic import BaseModel -from triton import runtime - -# import cupy as cp from . import utils, timing REPO_TOP_PATH = os.path.abspath( @@ -47,7 +46,6 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if isinstance(problem_id, str): problem_id = int(problem_id) - # TODO: replace dataset object @Omar problem_path = problems[problem_id] # problem_path = os.path.join(REPO_ROOT_PATH, problem) @@ -60,6 +58,7 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str else: return (problem_path, ref_arch) + def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False): PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR) @@ -71,36 +70,6 @@ def set_seed(seed: int): # NOTE: this only sets on current cuda device torch.cuda.manual_seed(seed) - -def clear_l2_cache(device: str = "cuda"): - """ - Clear L2 Cache line by thrashing - From GPU mode reference kernel repo: - https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 - - We can improve this - TODO; should prob check device_name - """ - # don't reserve space for persisting lines - # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) - - # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache - # 32 * 1024 * 1024 * 8B = 256MB - # NOTE: we can make this more adaptive based on device - # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 - dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) - # write to tenosr with inplace fill - dummy.fill_(42) - del dummy - -def clear_l2_cache_triton(cache, device): - # cp.cuda.runtime.cudaDeviceSetLimit( - # cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0 - # ) - cache = runtime.driver.active.get_empty_cache_for_benchmark() - runtime.driver.active.clear_cache(cache) - - def get_torch_dtype_from_string(precision: 
str) -> torch.dtype: """ Get the torch dtype for specific precision @@ -138,8 +107,9 @@ def get_tolerance_for_precision(precision: str | torch.dtype) -> float: class KernelExecResult(BaseModel): """ - Single Kernel Execution - all the information it needs + Single Kernel Execution """ + compiled: bool = False correctness: bool = False metadata: dict = {} @@ -417,57 +387,6 @@ def _process_input_tensor(input, device, backend="cuda", precision=torch.float32 return input_tensor.to(device=device) -def load_kernel( - verbose: str, - backend: str, - custom_model_src, - context, - build_dir, - device, - metadata: dict, - ) -> tuple[Union[nn.Module, KernelExecResult, None], Optional[tempfile.NamedTemporaryFile]]: - '''KernelExecResult means that loading the kernel failed (either because of compilation or something else), ModelNew that we succesfully loaded ModelNew''' - if verbose: - print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") - - try: - os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion - tempfile = None - - if backend.lower() in ["triton", "tilelang", "cute"]: - # Use tempfile approach for triton, tilelang, and cute - # These DSLs require proper module import for JIT decorators to work - ModelNew, tempfile = load_custom_model_with_tempfile( - custom_model_src, entry_point="ModelNew" - ) - else: - # Default CUDA backend - ModelNew = load_custom_model(custom_model_src, context, build_dir) - torch.cuda.synchronize(device=device) # not sure if this is too much - except Exception as e: - print( - f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" - ) - # TODO: add metadata for compilation error (how to we get the compilation error message?) - - if "lock" in str(e) or "No such file or directory" in str(e): - # this is a lock file error, likely due to concurrent compilation - # this does not necessarily mean the compilation failed, but we should retry - print( - f"[Eval] Lock file error during compilation, Please retry. 
Error: {e}" - ) - graceful_eval_cleanup(context, device, tempfile) - return None, None - else: - metadata["compilation_error_name"] = get_error_name(e) - metadata["compilation_error"] = str(e) - graceful_eval_cleanup(context, device, tempfile) - return KernelExecResult( - compiled=False, metadata=metadata - ), None - return ModelNew, tempfile - - def eval_kernel_against_ref( original_model_src: str, custom_model_src: str, @@ -551,15 +470,50 @@ def eval_kernel_against_ref( assert hasattr(original_model, "forward") if verbose: print("[Eval] Original Model Loaded") - result, tempfile = load_kernel( - verbose, backend, custom_model_src, context, build_dir, device, metadata - ) - if isinstance(result, KernelExecResult): - return result # loading the kernel failed, return the exec result - if result is None: - # lockfile / concurrent compilation: retryable failure - return None - ModelNew = result # we passed loading + + if verbose: + print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") + + # this is where compilation happens + try: + os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion + tempfile = None + # add hash for later to distinguish between multi-turn kernels + + backend_lower = backend.lower() + if backend_lower in ["triton", "tilelang", "cute"]: + # Use tempfile approach for triton, tilelang, and cute + # These DSLs require proper module import for JIT decorators to work + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) + else: + # Default CUDA backend + ModelNew = load_custom_model(custom_model_src, context, build_dir) + torch.cuda.synchronize(device=device) # not sure if this is too much + except Exception as e: + print( + f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" + ) + # TODO: add metadata for compilation error (how to we get the compilation error message?) + + if "lock" in str(e) or "No such file or directory" in str(e): + # this is a lock file error, likely due to concurrent compilation + # this does not necessarily mean the compilation failed, but we should retry + print( + f"[Eval] Lock file error during compilation, Please retry. 
Error: {e}" + ) + graceful_eval_cleanup(context, device, tempfile) + return None + else: + metadata["compilation_error_name"] = get_error_name(e) + metadata["compilation_error"] = e + graceful_eval_cleanup(context, device, tempfile) + return KernelExecResult( + compiled=False, metadata=metadata + ) # skip further steps + + # at this point we passed compilation try: with torch.no_grad(): set_seed(seed_num) # set seed for reproducible weights @@ -582,7 +536,7 @@ def eval_kernel_against_ref( compiled=True, correctness=False, metadata=metadata ) # skip further steps - # kernel_exec_result = None + kernel_exec_result = None # Check Correctness if verbose: @@ -624,9 +578,9 @@ def eval_kernel_against_ref( model_new = custom_model.to(device=device, dtype=precision) torch.cuda.synchronize(device=device) - # TODO: replace functions from timing based on we choose - # we should pass in which timing method you wanna do - elapsed_times = timing.time_execution_with_cuda_event( + # support multiple timing backend + timing_fn = timing.get_timing_function("cuda_event") + elapsed_times = timing_fn( model_new, inputs, num_trials=num_perf_trials, @@ -673,7 +627,6 @@ def register_and_format_exception( return metadata - def run_and_check_correctness( original_model_instance: nn.Module, new_model_instance: nn.Module, @@ -857,7 +810,6 @@ def convert_to_serializable(obj): - # if __name__ == "__main__": # fetch_kernel_from_database("kernelbench_prompt_v2_level_2", 1, 1, "http://localhost:9091") # print(fetch_ref_arch_from_level_problem_id("2", 1, with_name=True)) From 936f22149f5918496a980a08ac62bb94f6d07807 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 03:12:42 +0000 Subject: [PATCH 08/13] top_level eval entry point to set timing_method --- scripts/eval_from_generations.py | 6 ++++++ scripts/generate_and_eval_single_sample.py | 2 ++ .../generate_and_eval_single_sample_modal.py | 6 ++++-- scripts/run_and_check.py | 20 +++++++++++++++---- src/eval.py | 9 +++++++-- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 2e39e3be..b28a3be0 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -113,6 +113,7 @@ def __init__(self): self.num_perf_trials = 100 self.timeout = 180 # in seconds self.measure_performance = True + self.timing_method = "cuda_event" # Eval Flow setting # To speedup evaluation, you can start building the kernel on CPU on disk as cache @@ -173,6 +174,7 @@ def evaluate_single_sample_modal( num_correct_trials: int = 5, num_perf_trials: int = 100, measure_performance: bool = True, + timing_method: str = "cuda_event", verbose: bool = False, backend: str = "cuda", precision: str = "fp32", @@ -212,6 +214,7 @@ def evaluate_single_sample_modal( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, @@ -324,6 +327,7 @@ def evaluate_single_sample( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=configs.measure_performance, + timing_method=configs.timing_method, verbose=configs.verbose, num_correct_trials=configs.num_correct_trials, num_perf_trials=configs.num_perf_trials, @@ -384,6 +388,7 @@ def evaluate_single_sample_modal_direct( num_correct_trials=configs.num_correct_trials, num_perf_trials=configs.num_perf_trials, measure_performance=configs.measure_performance, + 
timing_method=configs.timing_method, verbose=configs.verbose, ) return eval_result @@ -502,6 +507,7 @@ def batch_eval_modal( num_correct_trials=config.num_correct_trials, num_perf_trials=config.num_perf_trials, measure_performance=config.measure_performance, + timing_method=config.timing_method, verbose=config.verbose, backend=config.backend, precision=config.precision, diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 2b2d5301..2e110932 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -73,6 +73,7 @@ def __init__(self): self.log_eval_result = False self.backend = "cuda" + self.timing_method = "cuda_event" # see timing.py # Prompt construction self.prompt_option = "one_shot" # choices: zero_shot, one_shot, few_shot @@ -267,6 +268,7 @@ def main(config: EvalConfig): custom_kernel, verbose=config.verbose, measure_performance=True, + timing_method=config.timing_method, num_correct_trials=5, num_perf_trials=100, backend=config.backend, diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 7628e0bf..9dee518a 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -75,6 +75,7 @@ def __init__(self): self.log_eval_result = False self.backend = "cuda" + self.timing_method = "cuda_event" # see timing.py # Prompt generation settings self.prompt_option = "one_shot" # zero_shot, one_shot, few_shot self.include_hardware_info = False @@ -110,7 +111,7 @@ def __repr__(self): class EvalFunc: @modal.method() - def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision): + def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, timing_method): # 3. Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation @@ -121,6 +122,7 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc modal_set_gpu_arch(gpu_arch) return eval_kernel_against_ref( ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True, + timing_method=timing_method, num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision) ) @@ -274,7 +276,7 @@ def main(config: EvalConfig): with app.run(): kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote( - ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision + ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, config.timing_method ) print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 316b96ee..e0492938 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -57,6 +57,8 @@ Usage: 1. PyTorch reference is a local file (local eval) python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local +python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/few_shot/model_ex_tiled_matmul.py kernel_src_path=src/prompts/few_shot/model_new_ex_tiled_matmul.py eval_mode=local + 2. 
PyTorch reference is a kernelbench problem (local eval) python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path= eval_mode=local @@ -101,6 +103,7 @@ def __init__(self): # verbose logging self.verbose = False self.measure_performance = True + self.timing_method = "cuda_event" # see timing.py self.build_dir_prefix = "" # if you want to specify a custom build directory self.clear_cache = False # TODO @@ -128,18 +131,23 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict num_perf_trials = configs["num_perf_trials"] verbose = configs["verbose"] measure_performance = configs["measure_performance"] + timing_method = configs["timing_method"] + backend = configs["backend"] + precision = kernel_eval.get_torch_dtype_from_string(configs["precision"]) + try: eval_result = kernel_eval.eval_kernel_against_ref( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, build_dir=build_dir, device=device, - backend=configs["backend"], - precision=kernel_eval.get_torch_dtype_from_string(configs["precision"]) + backend=backend, + precision=precision ) return eval_result except Exception as e: @@ -180,17 +188,21 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c num_perf_trials = configs["num_perf_trials"] verbose = configs["verbose"] measure_performance = configs["measure_performance"] + timing_method = configs["timing_method"] + backend = configs["backend"] + precision = kernel_eval.get_torch_dtype_from_string(configs["precision"]) eval_result = eval_kernel_against_ref( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, device=device, - backend=configs["backend"], - precision=get_torch_dtype_from_string(configs["precision"]) + backend=backend, + precision=precision ) return eval_result diff --git a/src/eval.py b/src/eval.py index f157a6b7..5f1fe8d8 100644 --- a/src/eval.py +++ b/src/eval.py @@ -393,8 +393,9 @@ def eval_kernel_against_ref( seed_num: int = 42, num_correct_trials: int = 1, num_perf_trials: int = 10, - verbose: bool = False, measure_performance: bool = False, + timing_method: str = "cuda_event", # see timing.py + verbose: bool = False, build_dir: os.PathLike = None, device: Union[torch.device, int] = ( torch.cuda.current_device() if torch.cuda.is_available() else None @@ -405,11 +406,15 @@ def eval_kernel_against_ref( """ Evaluate the custom kernel against the original model + NOTE: we are thinking about refactor this to be more modularized + and we can add more checks as our other ongiong PRs are working on + num_correct_trials: number of trials to initialize different random inputs; correctness pass only if all trials pass num_perf_trials: run the evalutation many times to take the average device: GPU (cuda) device to run the evalutation on backend: str, one of 'cuda', 'triton', 'tilelang', or 'cute' precision: torch.dtype for computation (note: tilelang only supports fp16) + timing_method: str, method to time kernel, see timing.py for more details """ # TODO: check device is busy assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" @@ -579,7 +584,7 @@ def eval_kernel_against_ref( torch.cuda.synchronize(device=device) # support multiple timing 
backend - timing_fn = timing.get_timing_function("cuda_event") + timing_fn = timing.get_timing_function(timing_method) elapsed_times = timing_fn( model_new, inputs, From 2c36572c71744a090c40194bf24d0234ded3eb68 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 03:47:15 +0000 Subject: [PATCH 09/13] remove discard_first for cuda event and updated documentation --- src/timing.py | 121 ++++++++++++++++++++--------- src/unit_tests/test_eval_timing.py | 20 +++-- 2 files changed, 98 insertions(+), 43 deletions(-) diff --git a/src/timing.py b/src/timing.py index 56cefd09..d07dd54f 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,18 +5,23 @@ from typing import Any import os + # we leverage triton's testing functionality for some timing methods from triton import runtime as triton_runtime from triton import testing as triton_testing ################################################################################ -# Performance Eval +# timing.py +# Various timing methods and utilities for performance evaluation +# please make a PR if you have suggestions! + +# Try them out at src/unit_tests/test_eval_timing.py ################################################################################ def clear_l2_cache(device: str = "cuda"): """ - Clear L2 Cache line by thrashing - From GPU mode reference kernel repo: + Clear L2 Cache line by thrashing with a large tensor + Acknowledge GPU mode reference kernel repo: https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 """ # don't reserve space for persisting lines @@ -27,7 +32,7 @@ def clear_l2_cache(device: str = "cuda"): # NOTE: we can make this more adaptive based on device # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) - # write to tenosr with inplace fill + # write to tensor with inplace fill dummy.fill_(42) del dummy @@ -45,17 +50,34 @@ def get_timing_function( method: str = "cuda_event", # by default ) -> callable: """ - Get the timing function based on different timing methods + Get timing function by method name. 
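+
+    Example (illustrative sketch; `kernel_fn` and `args` are placeholders for the
+    callable under test and its argument list, not names defined in this patch):
+        timing_fn = get_timing_function("cuda_event")
+        elapsed_ms = timing_fn(kernel_fn, args, num_warmup=3, num_trials=10)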
+ + Available methods: + - "cuda_event": torch.cuda.event timing (default, explicit trial control) + - "do_bench": Use triton's do_bench (adaptive trial count based on time budget) + - "do_bench_impl": Mirrors Triton's do_bench implementation (explicit control) + - "host_time": Host side wall-clock timing (might include overhead) + + Args: + method: Name of timing method to use + + Returns: + Timing function with signature (kernel_fn, args, num_warmup, num_trials, + discard_first, verbose, device) -> list[float] """ print( f"[Profiling] Using timing method: {method}" ) + # NOTE: here are all the timing methods we supporting for now match method: case "cuda_event": return time_execution_with_cuda_event - case "do_bench_interface": + case "do_bench": + # caveat: just using do_bench as it is + # do not have precise control over number of trials return time_execution_with_do_bench_interface case "do_bench_impl": + # do_bench equivalent implementations for transparency and control return time_execution_with_do_bench_impl case "host_time": return time_execution_with_host_time @@ -64,10 +86,8 @@ def get_timing_function( raise ValueError(f"Unsupported timing method: {method}") """ -Kernel Timing Functions [Revamp WIP] -TODO: see our detailed study on how to time kernel execution and benchmarking guide -we implement a few ways to do timing studies -These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +Kernel Timing Functions +NOTE: we have a WIP blogpost on this topic covering the various timing approaches """ @@ -76,25 +96,26 @@ def time_execution_with_cuda_event( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, + discard_first: int = 1, # not used verbose: bool = True, device: torch.device = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - The first version of KenrelBench used this for evaluation. + Time a CUDA kernel function over multiple trials using torch.cuda.event + The first version of KernelBench used this for evaluation. We care about cold cache performance here. 
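+    (Each timing trial below clears the L2 cache via clear_l2_cache() before
+    launching the kernel, so the reported numbers reflect cold-cache latency.)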
Args: kernel_fn: Function to time - *args: Arguments to pass to kernel_fn + args: Arguments to pass to kernel_fn + num_warmup: Number of warmup iterations before timing num_trials: Number of timing trials to run + discard_first: not used verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device + device: CUDA device to use, defaults to current device - TODO: double check this with team Returns: - List of elapsed times in milliseconds + List of elapsed times in milliseconds (length = num_trials) """ if device is None: if verbose: @@ -115,7 +136,7 @@ def time_execution_with_cuda_event( elapsed_times: list[float] = [] # in ms # Timing trials - for trial in range(num_trials + discard_first): + for trial in range(num_trials): torch.cuda.synchronize(device=device) # block on all streams # create event marker default is not interprocess @@ -136,11 +157,9 @@ def time_execution_with_cuda_event( # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) - if trial >= discard_first: - if verbose: - logical_idx = trial - discard_first + 1 - print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times @@ -148,17 +167,34 @@ def time_execution_with_cuda_event( def time_execution_with_do_bench_interface( kernel_fn: callable, args: list[Any], - # this is different for triton do_bench + # Not used, as triton do_bench handles adaptive trials num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used yet + discard_first: int = 1, # not used here verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - Using triton's default do_bench as it is - Note we don't set num_warmup and num_trials, and we use warmup 25 ms and repetition time 100 ms with Triton's default values - See doc: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html - Benchmark the runtime of the provided function. By default, return the median runtime of fn along with the 20-th and 80-th performance percentile. + Wrapper around Triton's do_bench for kernel timing. + + Uses Triton's adaptive benchmarking with fixed time budgets (warmup=25ms, rep=100ms) [Triton do_bench default]. + The number of trials is determined automatically based on kernel runtime. + + Note: num_warmup, num_trials, discard_first are ignored - included only for + API compatibility with other timing functions. 
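+
+    Example (illustrative sketch; `matmul_fn`, `a`, and `b` are placeholders, not
+    names defined in this patch):
+        timing_fn = get_timing_function("do_bench")
+        times = timing_fn(matmul_fn, [a, b])  # trial count is picked by Triton's time budget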
+ + Args: + kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_warmup: (ignored) Triton controls warmup + num_trials: (ignored) Triton controls trial count + discard_first: (ignored) Not used + verbose: Whether to print timing info + device: CUDA device to use + + Returns: + List of elapsed times in milliseconds + + See: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html """ do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments return triton_testing.do_bench(fn=do_bench_fn, @@ -174,7 +210,7 @@ def time_execution_with_do_bench_impl( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used yet + discard_first: int = 1, # not used here verbose: bool = True, device: torch.device | None = None) -> list[float]: """ @@ -184,14 +220,27 @@ def time_execution_with_do_bench_impl( Note we duplicate triton's implementation and modify / comment out parts to use num_warmup and num_trials that explicitly follows what user define here - instead of do_bench's version that computes how many times to run warmup and profile based on total warmup and repetition time + instead of do_bench's version that computes how many times to run warmup and + profile based on total warmup and repetition time + + We commented out unused parts and kept only what's needed for kernelbench timing eval + Args: + kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_warmup: Number of warmup iterations + num_trials: Number of timing trials + discard_first: (not used) Trials to discard + verbose: Whether to print timing info + device: CUDA device to use, defaults to current device + Returns: + List of elapsed times in milliseconds (length = num_trials) """ - device = torch.cuda.current_device() if device is not None else device + device = device if device is not None else torch.cuda.current_device() if verbose: print(f"Using do_bench to evaluate kernel on {device}") - # speicfy device interface (supports both nvidia and amd) + # specify device interface (supports both nvidia and amd) # under the hood, di is torch.cuda (amd uses a cuda compatible interface) di = triton_runtime.driver.active.get_device_interface() @@ -253,7 +302,7 @@ def time_execution_with_host_time( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, + discard_first: int = 1, # to reduce impact of initialization overhead verbose: bool = True, device: torch.device | None = None, ) -> list[float]: @@ -268,13 +317,12 @@ def time_execution_with_host_time( kernel_fn: Function to time args: Arguments to pass to kernel_fn num_trials: Number of timing trials to run + discard_first: Number of first few trials to discard (due to some initialization overhead) verbose: Whether to print per-trial timing info device: CUDA device to use, if None, use current device Returns: List of elapsed times in milliseconds - - Not recommended: """ if device is None: if verbose: @@ -329,6 +377,7 @@ def fetch_baseline_time( Fetch the baseline time from the time Note: might be better to just run the refernece using torch eager and compile sometimes + Will add this as a functionality for eval revamp """ if not os.path.exists(baseline_time_filepath): raise FileNotFoundError( diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 24d0faf2..b212bf78 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -37,12 +37,14 @@ def 
_run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): pytest.skip("CUDA not available, skipping timing tests") # Create simple test matrices - size = 512 - a = torch.randn(size, size, device=device) - b = torch.randn(size, size, device=device) + M = 2048 + N = 2048 + K = 2048 + a = torch.randn(M, K, device=device) + b = torch.randn(K, N, device=device) num_warmup = 5 - num_trials = 5 + num_trials = 100 # Define the kernel function to time def matmul_kernel(a, b): @@ -69,10 +71,14 @@ def matmul_kernel(a, b): print(stats) -timing_methods = ["cuda_event", "cpu_time", "do_bench_interface", "do_bench_impl"] +# test all currently available timing methods +def run_all_timing_tests(): + timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] -for timing_method in timing_methods: - _run_timing_smoke_test_matmul(timing_method) + for timing_method in timing_methods: + _run_timing_smoke_test_matmul(timing_method) + +run_all_timing_tests() From 4909b1dbdde3021ff3018425cefd27c638e75049 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 04:11:33 +0000 Subject: [PATCH 10/13] add discard_first for cuda_event --- src/timing.py | 20 ++++++++++++-------- src/unit_tests/test_eval_timing.py | 5 ++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/timing.py b/src/timing.py index d07dd54f..2d422711 100644 --- a/src/timing.py +++ b/src/timing.py @@ -96,7 +96,7 @@ def time_execution_with_cuda_event( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used + discard_first: int = 1, # set to 0 to disable verbose: bool = True, device: torch.device = None, ) -> list[float]: @@ -110,7 +110,7 @@ def time_execution_with_cuda_event( args: Arguments to pass to kernel_fn num_warmup: Number of warmup iterations before timing num_trials: Number of timing trials to run - discard_first: not used + discard_first: Number of first trials to discard, for consistency with host_time, set to 0 to disable verbose: Whether to print per-trial timing info device: CUDA device to use, defaults to current device @@ -126,17 +126,17 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) - + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache torch.cuda.empty_cache() - + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" ) elapsed_times: list[float] = [] # in ms # Timing trials - for trial in range(num_trials): + for trial in range(num_trials + discard_first): torch.cuda.synchronize(device=device) # block on all streams # create event marker default is not interprocess @@ -157,9 +157,13 @@ def time_execution_with_cuda_event( # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + return elapsed_times diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index b212bf78..24efe8ac 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -61,9 +61,12 @@ def matmul_kernel(a, b): # Validate results assert isinstance(elapsed_times, list), "Expected list of 
elapsed times" + + # disabled this check as do_bench does not use num_trials # assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + # DEBUG print times # print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") stats = timing.get_timing_stats(elapsed_times, device=device) @@ -74,7 +77,7 @@ def matmul_kernel(a, b): # test all currently available timing methods def run_all_timing_tests(): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] - + # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: _run_timing_smoke_test_matmul(timing_method) From 6c92786b9248370b4a165cb6d2e638997082a8ee Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 01:15:22 +0000 Subject: [PATCH 11/13] add device context for profile on particular device --- .../generate_and_eval_single_sample_modal.py | 1 - src/timing.py | 200 ++++++++++-------- src/unit_tests/test_eval_timing.py | 10 +- 3 files changed, 116 insertions(+), 95 deletions(-) diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 9dee518a..f41ba95f 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -14,7 +14,6 @@ from datasets import load_dataset #from src.dataset import construct_kernelbench_dataset -from src.eval import eval_kernel_against_ref from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets diff --git a/src/timing.py b/src/timing.py index 2d422711..8269feed 100644 --- a/src/timing.py +++ b/src/timing.py @@ -105,6 +105,10 @@ def time_execution_with_cuda_event( The first version of KernelBench used this for evaluation. We care about cold cache performance here. + Note: this version does not guard against adverserial cuda streams yet. + It assumes computation is done on the current stream for current device. + Stay tuned for future PRs. 
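+
+    Example (illustrative sketch; `kernel_fn`, `args`, and `device` are placeholders
+    for the callable under test, its inputs already on `device`, and the target GPU):
+        elapsed = time_execution_with_cuda_event(kernel_fn, args, num_warmup=3,
+                                                 num_trials=10, device=device)
+        stats = get_timing_stats(elapsed, device=device)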
+ Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -122,47 +126,49 @@ def time_execution_with_cuda_event( print(f"Using current device: {torch.cuda.current_device()}") device = torch.cuda.current_device() - # Warm ups - for _ in range(num_warmup): - kernel_fn(*args) - torch.cuda.synchronize(device=device) - - # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache - torch.cuda.empty_cache() - - print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - - elapsed_times: list[float] = [] # in ms - - # Timing trials - for trial in range(num_trials + discard_first): - torch.cuda.synchronize(device=device) # block on all streams - - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + with torch.cuda.device(device): - clear_l2_cache() # measuring cold cache performance + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) - # note cuda events mark event on current stream - start_event.record() - _ = kernel_fn(*args) - end_event.record() - - # waits for all streams on that device - # though it is important to note the events only record time between on current stream - # TODO: find ways to check hacks by launching work on additional stream - torch.cuda.synchronize(device=device) - - # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() - if trial >= discard_first: - if verbose: - logical_idx = trial - discard_first + 1 - print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + + elapsed_times: list[float] = [] # in ms + + # Timing trials + for trial in range(num_trials + discard_first): + torch.cuda.synchronize(device=device) # block on all streams + + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + clear_l2_cache(device=device) # measuring cold cache performance + + # note cuda events mark event on current stream + start_event.record() + _ = kernel_fn(*args) + end_event.record() + + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times @@ -200,8 +206,15 @@ def time_execution_with_do_bench_interface( See: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments - return triton_testing.do_bench(fn=do_bench_fn, + with 
torch.cuda.device(device): + return triton_testing.do_bench(fn=do_bench_fn, warmup=25, rep=100, grad_to_none=None, @@ -244,58 +257,63 @@ def time_execution_with_do_bench_impl( if verbose: print(f"Using do_bench to evaluate kernel on {device}") - # specify device interface (supports both nvidia and amd) - # under the hood, di is torch.cuda (amd uses a cuda compatible interface) - di = triton_runtime.driver.active.get_device_interface() - - kernel_fn(*args) - di.synchronize(device=device) - - # clear l2 cache - cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() - - # do_bench Estimate the runtime of the function - # Here we are not using it not needed since now the warmup and repeat steps are set by the user) - # start_event = di.Event(enable_timing=True) - # end_event = di.Event(enable_timing=True) - # start_event.record() - # for _ in range(5): - # triton_runtime.driver.active.clear_cache(cache) - # kernel_fn(*args) - # end_event.record() - # di.synchronize() - # estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - # Change - # n_warmup = max(1, int(warmup / estimate_ms)) - # n_repeat = max(1, int(rep / estimate_ms)) - # n_warmup = warmup - # n_repeat = rep - # end of change - start_event = [di.Event(enable_timing=True) for i in range(num_trials)] - end_event = [di.Event(enable_timing=True) for i in range(num_trials)] - # Warm-up - for _ in range(num_warmup): - kernel_fn(*args) - # Benchmark - for i in range(num_trials): - # All KernelBench functions are forward passes, so we don't need to reset gradients - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. So we clear the - # provided gradients - # if grad_to_none is not None: - # for x in grad_to_none: - # x.grad = None - - # we clear the L2 cache before each run - triton_runtime.driver.active.clear_cache(cache) - # record time of `fn` - start_event[i].record() + + # added to constraint to this device + with torch.cuda.device(device): + + # specify device interface (supports both nvidia and amd) + # under the hood, di is torch.cuda (amd uses a cuda compatible interface) + di = triton_runtime.driver.active.get_device_interface() + kernel_fn(*args) - end_event[i].record() - # Record clocks - di.synchronize(device=device) + di.synchronize(device=device) + + # clear l2 cache + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + + # do_bench Estimate the runtime of the function + # Here we are not using it not needed since now the warmup and repeat steps are set by the user) + # start_event = di.Event(enable_timing=True) + # end_event = di.Event(enable_timing=True) + # start_event.record() + # for _ in range(5): + # triton_runtime.driver.active.clear_cache(cache) + # kernel_fn(*args) + # end_event.record() + # di.synchronize() + # estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + # n_warmup = warmup + # n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(num_trials)] + end_event = [di.Event(enable_timing=True) for i in range(num_trials)] + # Warm-up + for _ in range(num_warmup): + kernel_fn(*args) + # Benchmark + for i in range(num_trials): + # All KernelBench functions are forward passes, so we don't need to reset gradients + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. 
So we clear the + # provided gradients + # if grad_to_none is not None: + # for x in grad_to_none: + # x.grad = None + + # we clear the L2 cache before each run + triton_runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + kernel_fn(*args) + end_event[i].record() + # Record clocks + di.synchronize(device=device) + if verbose: print('Done with do_bench evaluation') times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] return times @@ -350,7 +368,7 @@ def time_execution_with_host_time( torch.cuda.synchronize(device=device) # focus on cold_cache performance - clear_l2_cache() + clear_l2_cache(device=device) # CPU-side wall clock time using perf_counter (high-resolution timer) start_time = time.perf_counter() diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 24efe8ac..07fca713 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -57,6 +57,7 @@ def matmul_kernel(a, b): num_warmup=num_warmup, num_trials=num_trials, verbose=False, + device=device ) # Validate results @@ -75,13 +76,16 @@ def matmul_kernel(a, b): # test all currently available timing methods -def run_all_timing_tests(): +def run_all_timing_tests(device="cuda"): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: - _run_timing_smoke_test_matmul(timing_method) + _run_timing_smoke_test_matmul(timing_method, device=device) -run_all_timing_tests() + + +test_device = torch.device("cuda:5") +run_all_timing_tests(test_device) From 8a165d67271cd89bd29b8a2add808155d9d306f1 Mon Sep 17 00:00:00 2001 From: Pietro Date: Tue, 16 Dec 2025 01:22:22 +0000 Subject: [PATCH 12/13] nit fix ready for merge --- src/unit_tests/test_eval_timing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 07fca713..84921a37 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -26,7 +26,7 @@ def _run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): """ Scaffold function for timing smoke tests. - Smoke test for using 512x512 matmul. + Smoke test for using 2048x2048x2048 matmul with 5 warmup and 100 trials. 
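+
+    Example (sketch):
+        _run_timing_smoke_test_matmul("cuda_event", device="cuda")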
Args: timing_fn: The timing function to test From c063b8127fd6b1b1825b38344bfe7ba4a378604e Mon Sep 17 00:00:00 2001 From: Pietro Date: Tue, 16 Dec 2025 01:33:32 +0000 Subject: [PATCH 13/13] type annotation for device --- src/timing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/timing.py b/src/timing.py index 8269feed..8a36522b 100644 --- a/src/timing.py +++ b/src/timing.py @@ -18,7 +18,7 @@ # Try them out at src/unit_tests/test_eval_timing.py ################################################################################ -def clear_l2_cache(device: str = "cuda"): +def clear_l2_cache(device: torch.device | str = "cuda"): """ Clear L2 Cache line by thrashing with a large tensor Acknowledge GPU mode reference kernel repo: @@ -150,7 +150,7 @@ def time_execution_with_cuda_event( end_event = torch.cuda.Event(enable_timing=True) clear_l2_cache(device=device) # measuring cold cache performance - + # note cuda events mark event on current stream start_event.record() _ = kernel_fn(*args) @@ -295,6 +295,8 @@ def time_execution_with_do_bench_impl( # Warm-up for _ in range(num_warmup): kernel_fn(*args) + di.synchronize(device=device) + # Benchmark for i in range(num_trials): # All KernelBench functions are forward passes, so we don't need to reset gradients @@ -313,9 +315,9 @@ def time_execution_with_do_bench_impl( end_event[i].record() # Record clocks di.synchronize(device=device) - + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + if verbose: print('Done with do_bench evaluation') - times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] return times