From db46141cdcdf84b9905175754b90ca05e2995331 Mon Sep 17 00:00:00 2001 From: Kesavan Date: Mon, 10 Nov 2025 18:55:16 +0000 Subject: [PATCH 01/13] Start seperate timing file for timing functions Co-authored-by: Simon Guo Co-authored-by: Pietro Marsella --- src/timing.py | 181 +++++++++++++++++++++++++++++ src/unit_tests/test_eval_timing.py | 22 ++++ 2 files changed, 203 insertions(+) create mode 100644 src/timing.py create mode 100644 src/unit_tests/test_eval_timing.py diff --git a/src/timing.py b/src/timing.py new file mode 100644 index 00000000..e6c2c006 --- /dev/null +++ b/src/timing.py @@ -0,0 +1,181 @@ +import torch +import json +import triton +import numpy as np +import time +from typing import Any +import os, sys + +################################################################################ +# Performance Eval +################################################################################ + +############################################################# +# Timing Functions +# TODO: see our detailed study on how to time kernel execution +# we implement a few ways to do timing studies +# agnositic whether the modules are rather Model or ModelNew +############################################################# + + +def get_timing_function( + method: str = "cuda_event", # by default +) -> callable: + """ + Get the timing function based on different timing methods + """ + assert method in ["cuda_event", "do_bench", "time_time"] + print( + f"[Profiling] Using timing method: {method}" + ) + match method: + case "cuda_event": + return time_execution_with_cuda_event + case "do_bench": + return time_execution_with_do_bench + case "time_time": + return time_execution_with_tim_dot_time + case _: + raise ValueError(f"Unknown timing method: {method}") + + +# TODO: do we want to support pytorch profiler + +def time_execution_with_do_bench( + kernel_fn: callable, + *args, + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using triton.do_bench + """ + + raise NotImplementedError + + +def time_execution_with_time_dot_time( + kernel_fn: callable, + *args, + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using time.time() + """ + raise RuntimeError("This function should not be used for timing, it's here purely for reference") + + # use this + # start = time.time() + # this is not the way but we will implement it for tutorial + + + +def time_execution_with_cuda_event( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using torch.cuda.Event + + Args: + kernel_fn: Function to time + *args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + TODO: make this super solid and check this + Returns: + List of elapsed times in milliseconds + """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + print( + f"[Profiling] Using device: {device} 
{torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + elapsed_times = [] + + # Actual trials + for trial in range(num_trials): + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + start_event.record() + kernel_fn(*args) + end_event.record() + + # Synchronize to ensure the events have completed + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + + return elapsed_times + +######################################################## +# Timing stats +######################################################### +def fetch_baseline_time( + level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str +) -> dict: + """ + Fetch the baseline time from the time + """ + if not os.path.exists(baseline_time_filepath): + raise FileNotFoundError( + f"Baseline time file not found at {baseline_time_filepath}" + ) + + with open(baseline_time_filepath, "r") as f: + baseline_json = json.load(f) + + problem_name = dataset[problem_id].split("/")[-1] + baseline_time = baseline_json[level_name].get(problem_name, None) + return baseline_time + + +def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict: + """Get timing statistics from a list of elapsed times. + + Args: + elapsed_times: List of elapsed times in milliseconds + device: CUDA device, record device info + Returns: + Dict containing mean, std, min, max and num_trials + all timing are in ms + """ + + stats = { + "mean": float(f"{np.mean(elapsed_times):.3g}"), + "std": float(f"{np.std(elapsed_times):.3g}"), + "min": float(f"{np.min(elapsed_times):.3g}"), + "max": float(f"{np.max(elapsed_times):.3g}"), + "num_trials": len(elapsed_times), + } + + if device: + stats["hardware"] = torch.cuda.get_device_name(device=device) + stats["device"] = str(device) # for debugging + + return stats diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py new file mode 100644 index 00000000..ef6fffc6 --- /dev/null +++ b/src/unit_tests/test_eval_timing.py @@ -0,0 +1,22 @@ +import os + + +""" +Test Timing + +We want to systematically study different timing methodologies. 
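+The candidates implemented in src/timing.py are CUDA events, triton's do_bench, and host-side time.time() measurements,
+selected by name via get_timing_function("cuda_event" | "do_bench" | "time_time").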
+ +""" +REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) + +# use exampls in the few shot directory +EXAMPLES_PATH = os.path.join(REPO_PATH, "src", "prompts", "few_shot") + +# Configure your test cases here +TEST_REF_FILE = "model_ex_tiled_matmul.py" +TEST_KERNEL_FILE = "model_new_ex_tiled_matmul.py" + +assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_REF_FILE)), f"Reference file {TEST_REF_FILE} does not exist in {EXAMPLES_PATH}" +assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" + + From 5ff8891b3af1044c89208b5971418b70e753aad5 Mon Sep 17 00:00:00 2001 From: Sahan Date: Sun, 16 Nov 2025 23:38:39 +0000 Subject: [PATCH 02/13] Add tests, cache clearning, time, and do_bench --- src/do_bench.py | 126 +++++++++++++++++++++++++++++ src/timing.py | 75 +++++++++++++---- src/unit_tests/test_eval_timing.py | 73 +++++++++++++++++ 3 files changed, 259 insertions(+), 15 deletions(-) create mode 100644 src/do_bench.py diff --git a/src/do_bench.py b/src/do_bench.py new file mode 100644 index 00000000..a6e8d8f0 --- /dev/null +++ b/src/do_bench.py @@ -0,0 +1,126 @@ +import math +import statistics +from triton import runtime + + +# pure Python implementation of np.quantile/torch.quantile +# to avoid unnecessary runtime dependency on numpy/torch + +# This is a slightly modfied version of triton.testing.do_bench (triton v3.5.x) from +# https://github.com/triton-lang/triton/blob/0add68262ab0a2e33b84524346cb27cbb2787356/python/triton/testing.py#L127 +# with minor a minor modification to support having warmup and repeat time instead be specified in number of iterations +# instead of ms. All changes are explcitly marked + +def _quantile(a, q): + n = len(a) + a = sorted(a) + + def get_quantile(q): + if not (0 <= q <= 1): + raise ValueError("Quantiles must be in the range [0, 1]") + point = q * (n - 1) + lower = math.floor(point) + upper = math.ceil(point) + t = point - lower + return (1 - t) * a[lower] + t * a[upper] + + return [get_quantile(q) for q in q] + + +def _summarize_statistics(times, quantiles, return_mode): + if quantiles is not None: + ret = _quantile(times, quantiles) + if len(ret) == 1: + ret = ret[0] + return ret + if return_mode == "all": + return times + elif return_mode == "min": + return min(times) + elif return_mode == "max": + return max(times) + elif return_mode == "mean": + return statistics.mean(times) + elif return_mode == "median": + return statistics.median(times) + + +def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"): + """ + Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with + the 20-th and 80-th performance percentile. CHANGE: warmup and repeat time are specified in number of iterations rather than ms + + + :param fn: Function to benchmark + :type fn: Callable + :param warmup: Warmup time (in number of iterations) + :type warmup: int + :param rep: Repetition time (in number of iterations) + :type rep: int + :param grad_to_none: Reset the gradient of the provided tensor to None + :type grad_to_none: torch.tensor, optional + :param quantiles: Performance percentile to return in addition to the median. + :type quantiles: list[float], optional + :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean". 
+ :type return_mode: str + """ + assert return_mode in ["min", "max", "mean", "median", "all"] + + # Change + # mean, max, min, quantiles, etc. make no sense with 0 reps + if not (return_mode == "all" and quantiles is None) and rep < 1: + error_msg = ( + f"You are running with {rep} reps. This is likely a mistake!!!\n" + "We do let you do this, but ONLY when quantiles is None when return_mode is not 'all'\n" + "to be consistent with the rest of KernelBench's timing functions" + ) + raise ValueError(error_msg) + # End of change + di = runtime.driver.active.get_device_interface() + + fn() + di.synchronize() + + cache = runtime.driver.active.get_empty_cache_for_benchmark() + + # Estimate the runtime of the function + start_event = di.Event(enable_timing=True) + end_event = di.Event(enable_timing=True) + start_event.record() + for _ in range(5): + runtime.driver.active.clear_cache(cache) + fn() + end_event.record() + di.synchronize() + estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + n_warmup = warmup + n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(n_repeat)] + end_event = [di.Event(enable_timing=True) for i in range(n_repeat)] + # Warm-up + for _ in range(n_warmup): + fn() + # Benchmark + for i in range(n_repeat): + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. So we clear the + # provided gradients + if grad_to_none is not None: + for x in grad_to_none: + x.grad = None + # we clear the L2 cache before each run + runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + fn() + end_event[i].record() + # Record clocks + di.synchronize() + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + return _summarize_statistics(times, quantiles, return_mode) \ No newline at end of file diff --git a/src/timing.py b/src/timing.py index e6c2c006..3dc49b0d 100644 --- a/src/timing.py +++ b/src/timing.py @@ -1,10 +1,11 @@ import torch import json -import triton import numpy as np import time +import warnings from typing import Any -import os, sys +import os +from do_bench import do_bench ################################################################################ # Performance Eval @@ -17,7 +18,6 @@ # agnositic whether the modules are rather Model or ModelNew ############################################################# - def get_timing_function( method: str = "cuda_event", # by default ) -> callable: @@ -37,13 +37,10 @@ def get_timing_function( return time_execution_with_tim_dot_time case _: raise ValueError(f"Unknown timing method: {method}") - - -# TODO: do we want to support pytorch profiler def time_execution_with_do_bench( kernel_fn: callable, - *args, + args: list[Any], num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, @@ -52,13 +49,17 @@ def time_execution_with_do_bench( """ Time a CUDA kernel function over multiple trials using triton.do_bench """ - - raise NotImplementedError + return do_bench( + lambda: kernel_fn(*args), + warmup=num_warmup, + rep=num_trials, + return_mode="all", + ) def time_execution_with_time_dot_time( kernel_fn: callable, - *args, + args: list[Any], num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, @@ -66,12 +67,54 @@ def time_execution_with_time_dot_time( ) -> list[float]: """ Time a CUDA kernel function over multiple trials using time.time() + + Args: 
+ kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + Returns: + List of elapsed times in milliseconds """ - raise RuntimeError("This function should not be used for timing, it's here purely for reference") - - # use this - # start = time.time() - # this is not the way but we will implement it for tutorial + + # give warning that this is not the way to do it + warnings.warn( + "time_execution_with_time_dot_time is meant for educational purposes only, please other options like time_with_cuda_event or time_with_do_bench", + UserWarning, + ) + + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + print( + f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + elapsed_times = [] + + # Actual trials + for trial in range(num_trials): + start_time = time.time() + kernel_fn(*args) + torch.cuda.synchronize(device=device) + end_time = time.time() + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = (end_time - start_time) * 1000 + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + + return elapsed_times + @@ -106,6 +149,7 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) + torch.cuda.clear_cache() print( f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" @@ -124,6 +168,7 @@ def time_execution_with_cuda_event( # Synchronize to ensure the events have completed torch.cuda.synchronize(device=device) + torch.cuda.clear_cache() # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index ef6fffc6..23a100db 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -1,4 +1,14 @@ import os +import sys +import torch +import pytest + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +from timing import ( + time_execution_with_cuda_event, + time_execution_with_time_dot_time, + time_execution_with_do_bench, +) """ @@ -20,3 +30,66 @@ assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" +def _run_timing_smoke_test(timing_fn): + """ + Scaffold function for timing smoke tests. 
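+    Runs a 512x512 matmul through the given timing function and sanity-checks the returned trial times.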
+ + Args: + timing_fn: The timing function to test + use_args: Whether the timing function expects args parameter (True for cuda_event/time_dot_time, False for do_bench) + """ + # Skip if CUDA is not available + if not torch.cuda.is_available(): + pytest.skip("CUDA not available, skipping timing tests") + + # Create test matrices + size = 512 + a = torch.randn(size, size, device='cuda') + b = torch.randn(size, size, device='cuda') + + num_warmup = 5 + num_trials = 5 + + # Define the kernel function to time + def matmul_kernel(a, b): + return torch.matmul(a, b) + + elapsed_times = timing_fn( + matmul_kernel, + args=[a, b], + num_warmup=num_warmup, + num_trials=num_trials, + verbose=False, + ) + + # Validate results + assert isinstance(elapsed_times, list), "Expected list of elapsed times" + assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" + assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" + assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + + +def test_time_execution_with_cuda_event_smoke(): + """ + Smoke test for time_execution_with_cuda_event using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + """ + _run_timing_smoke_test(time_execution_with_cuda_event) + + +def test_time_execution_with_time_dot_time_smoke(): + """ + Smoke test for time_execution_with_time_dot_time using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + """ + _run_timing_smoke_test(time_execution_with_time_dot_time) + + +def test_time_execution_with_do_bench_smoke(): + """ + Smoke test for time_execution_with_do_bench using 512x512 matmul. + Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. 
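+    Note: the vendored do_bench in src/do_bench.py treats warmup/rep as iteration counts, so num_warmup and num_trials map onto them directly.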
+ """ + _run_timing_smoke_test(time_execution_with_do_bench) + + From 9581487ea29a9f742cfaf1040bd4deb1626e9026 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 12 Dec 2025 03:29:51 +0000 Subject: [PATCH 03/13] reorganize timing func, migrate cuda event with l2 cache from branch to here; a few other to implement --- requirements.txt | 4 +- src/timing.py | 140 +++++++++++++++++++++-------- src/unit_tests/test_eval_timing.py | 56 ++++++------ 3 files changed, 131 insertions(+), 69 deletions(-) diff --git a/requirements.txt b/requirements.txt index d7f31a49..cc7300f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Frameworks -torch==2.5.0 +torch==2.9.0 # we shall upgrade torch for blackwell when it is stable transformers datasets @@ -9,6 +9,7 @@ modal nvidia-cutlass-dsl tilelang apache-tvm +triton # helper tqdm @@ -16,6 +17,7 @@ packaging pydra_config pytest ninja +cupy-cuda12x # Numerics einops diff --git a/src/timing.py b/src/timing.py index 3dc49b0d..007d75c6 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,26 +5,25 @@ import warnings from typing import Any import os -from do_bench import do_bench +from triton import runtime +from eval import clear_l2_cache ################################################################################ # Performance Eval ################################################################################ -############################################################# -# Timing Functions -# TODO: see our detailed study on how to time kernel execution -# we implement a few ways to do timing studies -# agnositic whether the modules are rather Model or ModelNew -############################################################# - +""" +Kernel Timing Functions [Revamp WIP] +TODO: see our detailed study on how to time kernel execution and benchmarking guide +we implement a few ways to do timing studies +These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +""" def get_timing_function( method: str = "cuda_event", # by default ) -> callable: """ Get the timing function based on different timing methods """ - assert method in ["cuda_event", "do_bench", "time_time"] print( f"[Profiling] Using timing method: {method}" ) @@ -34,9 +33,9 @@ def get_timing_function( case "do_bench": return time_execution_with_do_bench case "time_time": - return time_execution_with_tim_dot_time + return time_execution_with_time_dot_time # this is just for education purpose, don't use this case _: - raise ValueError(f"Unknown timing method: {method}") + raise ValueError(f"Unsupported timing method: {method}") def time_execution_with_do_bench( kernel_fn: callable, @@ -44,18 +43,71 @@ def time_execution_with_do_bench( num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, - device: torch.device = None, -) -> list[float]: + device: torch.device | None = None) -> list[float]: """ - Time a CUDA kernel function over multiple trials using triton.do_bench + TODO: need check do_bench + [WIP] need to check """ - return do_bench( - lambda: kernel_fn(*args), - warmup=num_warmup, - rep=num_trials, - return_mode="all", - ) - + + device = torch.cuda.current_device() if device is not None else device + + if verbose: print("Using do_bench to evaluate kernel") + + # note: for both nvidia and amd, di is torch.cuda (amd uses a cuda compatible interface), so we could really just have torch.cuda + di = runtime.driver.active.get_device_interface() + + kernel_fn(*args) + di.synchronize(device=device) + + 
cache = runtime.driver.active.get_empty_cache_for_benchmark() + + # Estimate the runtime of the function (not needed since now the warmup and repeat steps are set by the user) + + # start_event = di.Event(enable_timing=True) + # end_event = di.Event(enable_timing=True) + # start_event.record() + # for _ in range(5): + # runtime.driver.active.clear_cache(cache) + # kernel_fn(*args) + # end_event.record() + # di.synchronize() + # estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + # n_warmup = warmup + # n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(num_trials)] + end_event = [di.Event(enable_timing=True) for i in range(num_trials)] + # Warm-up + for _ in range(num_warmup): + kernel_fn(*args) + # Benchmark + for i in range(num_trials): + + # All our functions are forward passes, so we don't need to reset gradients + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. So we clear the + # provided gradients + # if grad_to_none is not None: + # for x in grad_to_none: + # x.grad = None + + # we clear the L2 cache before each run + runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + kernel_fn(*args) + end_event[i].record() + # Record clocks + di.synchronize(device=device) + if verbose: print('Done with do_bench evaluation') + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + return times + def time_execution_with_time_dot_time( kernel_fn: callable, @@ -63,11 +115,11 @@ def time_execution_with_time_dot_time( num_warmup: int = 3, num_trials: int = 10, verbose: bool = True, - device: torch.device = None, + device: torch.device | None = None, ) -> list[float]: """ Time a CUDA kernel function over multiple trials using time.time() - + [WIP] Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -77,6 +129,8 @@ def time_execution_with_time_dot_time( Returns: List of elapsed times in milliseconds + + Not recommended: """ # give warning that this is not the way to do it @@ -95,9 +149,7 @@ def time_execution_with_time_dot_time( kernel_fn(*args) torch.cuda.synchronize(device=device) - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") elapsed_times = [] # Actual trials @@ -128,6 +180,8 @@ def time_execution_with_cuda_event( ) -> list[float]: """ Time a CUDA kernel function over multiple trials using torch.cuda.Event + The first version of KenrelBench used this for evaluation. + We care about cold cache performance here. 
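+    The L2 cache is cleared before each timed trial, so every measurement reflects a cold-cache run.
+
+    Example (a sketch; assumes tensors a and b already live on the GPU):
+        elapsed = time_execution_with_cuda_event(torch.matmul, [a, b], num_warmup=3, num_trials=10)
+        stats = get_timing_stats(elapsed)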
Args: kernel_fn: Function to time @@ -136,7 +190,7 @@ def time_execution_with_cuda_event( verbose: Whether to print per-trial timing info device: CUDA device to use, if None, use current device - TODO: make this super solid and check this + TODO: double check this with team Returns: List of elapsed times in milliseconds """ @@ -149,35 +203,44 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) - torch.cuda.clear_cache() - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() + + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" ) - elapsed_times = [] - # Actual trials + elapsed_times: list[float] = [] # in ms + + # Timing trials for trial in range(num_trials): + torch.cuda.synchronize(device=device) # block on all streams + # create event marker default is not interprocess start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - + + clear_l2_cache() # measuring cold cache performance + + # note cuda events mark event on current stream start_event.record() - kernel_fn(*args) - end_event.record() + _ = kernel_fn(*args) + end_event.record() - # Synchronize to ensure the events have completed + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream torch.cuda.synchronize(device=device) - torch.cuda.clear_cache() # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + print(f"Timing Trial {trial + 1}: {elapsed_time_ms:.3g} ms") elapsed_times.append(elapsed_time_ms) return elapsed_times + ######################################################## # Timing stats ######################################################### @@ -195,6 +258,7 @@ def fetch_baseline_time( with open(baseline_time_filepath, "r") as f: baseline_json = json.load(f) + # TODO: replace with the new Dataset object that Omar will merge in problem_name = dataset[problem_id].split("/")[-1] baseline_time = baseline_json[level_name].get(problem_name, None) return baseline_time diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 23a100db..955b2519 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -4,18 +4,11 @@ import pytest sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from timing import ( - time_execution_with_cuda_event, - time_execution_with_time_dot_time, - time_execution_with_do_bench, -) - +import timing """ Test Timing - We want to systematically study different timing methodologies. - """ REPO_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")) @@ -30,10 +23,11 @@ assert os.path.exists(os.path.join(EXAMPLES_PATH, TEST_KERNEL_FILE)), f"Kernel file {TEST_KERNEL_FILE} does not exist in {EXAMPLES_PATH}" -def _run_timing_smoke_test(timing_fn): +def _run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): """ Scaffold function for timing smoke tests. - + Smoke test for using 512x512 matmul. 
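+    The timing function is looked up by name via timing.get_timing_function(timing_func_name).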
+ Args: timing_fn: The timing function to test use_args: Whether the timing function expects args parameter (True for cuda_event/time_dot_time, False for do_bench) @@ -42,10 +36,10 @@ def _run_timing_smoke_test(timing_fn): if not torch.cuda.is_available(): pytest.skip("CUDA not available, skipping timing tests") - # Create test matrices + # Create simple test matrices size = 512 - a = torch.randn(size, size, device='cuda') - b = torch.randn(size, size, device='cuda') + a = torch.randn(size, size, device=device) + b = torch.randn(size, size, device=device) num_warmup = 5 num_trials = 5 @@ -54,7 +48,8 @@ def _run_timing_smoke_test(timing_fn): def matmul_kernel(a, b): return torch.matmul(a, b) - elapsed_times = timing_fn( + timing_func = timing.get_timing_function(timing_func_name) + elapsed_times = timing_func( matmul_kernel, args=[a, b], num_warmup=num_warmup, @@ -67,29 +62,30 @@ def matmul_kernel(a, b): assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") -def test_time_execution_with_cuda_event_smoke(): - """ - Smoke test for time_execution_with_cuda_event using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. - """ - _run_timing_smoke_test(time_execution_with_cuda_event) +_run_timing_smoke_test_matmul("cuda_event") -def test_time_execution_with_time_dot_time_smoke(): +def test_do_bench_simple_smoke(): """ - Smoke test for time_execution_with_time_dot_time using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. + Smoke test for do_bench itself on a simple CUDA operation. + Just checks it runs and returns timings. """ - _run_timing_smoke_test(time_execution_with_time_dot_time) + if not torch.cuda.is_available(): + pytest.skip("CUDA not available, skipping do_bench smoke test") + from do_bench import do_bench -def test_time_execution_with_do_bench_smoke(): - """ - Smoke test for time_execution_with_do_bench using 512x512 matmul. - Tests with 5 warmup and 5 trials, validates list of 5 positive floats is returned. 
- """ - _run_timing_smoke_test(time_execution_with_do_bench) + x = torch.randn(1024, device="cuda") + + def fn(): + # simple GPU op; do_bench will sync/timestamp internally + return (x * 2).sum() + rep = 5 + times = do_bench(fn, warmup=2, rep=rep, return_mode="all") + assert isinstance(times, list) + assert len(times) == rep From 467f856d6af44629fcc07ccfcdf36365b4cb95d7 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Fri, 12 Dec 2025 04:25:48 +0000 Subject: [PATCH 04/13] implement do_bench and cpu host timing, script to run all 4 timing methods --- src/do_bench.py | 126 ------------ src/timing.py | 296 ++++++++++++++++++----------- src/unit_tests/test_eval_timing.py | 14 +- 3 files changed, 193 insertions(+), 243 deletions(-) delete mode 100644 src/do_bench.py diff --git a/src/do_bench.py b/src/do_bench.py deleted file mode 100644 index a6e8d8f0..00000000 --- a/src/do_bench.py +++ /dev/null @@ -1,126 +0,0 @@ -import math -import statistics -from triton import runtime - - -# pure Python implementation of np.quantile/torch.quantile -# to avoid unnecessary runtime dependency on numpy/torch - -# This is a slightly modfied version of triton.testing.do_bench (triton v3.5.x) from -# https://github.com/triton-lang/triton/blob/0add68262ab0a2e33b84524346cb27cbb2787356/python/triton/testing.py#L127 -# with minor a minor modification to support having warmup and repeat time instead be specified in number of iterations -# instead of ms. All changes are explcitly marked - -def _quantile(a, q): - n = len(a) - a = sorted(a) - - def get_quantile(q): - if not (0 <= q <= 1): - raise ValueError("Quantiles must be in the range [0, 1]") - point = q * (n - 1) - lower = math.floor(point) - upper = math.ceil(point) - t = point - lower - return (1 - t) * a[lower] + t * a[upper] - - return [get_quantile(q) for q in q] - - -def _summarize_statistics(times, quantiles, return_mode): - if quantiles is not None: - ret = _quantile(times, quantiles) - if len(ret) == 1: - ret = ret[0] - return ret - if return_mode == "all": - return times - elif return_mode == "min": - return min(times) - elif return_mode == "max": - return max(times) - elif return_mode == "mean": - return statistics.mean(times) - elif return_mode == "median": - return statistics.median(times) - - -def do_bench(fn, warmup=25, rep=100, grad_to_none=None, quantiles=None, return_mode="mean"): - """ - Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with - the 20-th and 80-th performance percentile. CHANGE: warmup and repeat time are specified in number of iterations rather than ms - - - :param fn: Function to benchmark - :type fn: Callable - :param warmup: Warmup time (in number of iterations) - :type warmup: int - :param rep: Repetition time (in number of iterations) - :type rep: int - :param grad_to_none: Reset the gradient of the provided tensor to None - :type grad_to_none: torch.tensor, optional - :param quantiles: Performance percentile to return in addition to the median. - :type quantiles: list[float], optional - :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean". - :type return_mode: str - """ - assert return_mode in ["min", "max", "mean", "median", "all"] - - # Change - # mean, max, min, quantiles, etc. make no sense with 0 reps - if not (return_mode == "all" and quantiles is None) and rep < 1: - error_msg = ( - f"You are running with {rep} reps. 
This is likely a mistake!!!\n" - "We do let you do this, but ONLY when quantiles is None when return_mode is not 'all'\n" - "to be consistent with the rest of KernelBench's timing functions" - ) - raise ValueError(error_msg) - # End of change - di = runtime.driver.active.get_device_interface() - - fn() - di.synchronize() - - cache = runtime.driver.active.get_empty_cache_for_benchmark() - - # Estimate the runtime of the function - start_event = di.Event(enable_timing=True) - end_event = di.Event(enable_timing=True) - start_event.record() - for _ in range(5): - runtime.driver.active.clear_cache(cache) - fn() - end_event.record() - di.synchronize() - estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - # Change - # n_warmup = max(1, int(warmup / estimate_ms)) - # n_repeat = max(1, int(rep / estimate_ms)) - n_warmup = warmup - n_repeat = rep - # end of change - start_event = [di.Event(enable_timing=True) for i in range(n_repeat)] - end_event = [di.Event(enable_timing=True) for i in range(n_repeat)] - # Warm-up - for _ in range(n_warmup): - fn() - # Benchmark - for i in range(n_repeat): - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. So we clear the - # provided gradients - if grad_to_none is not None: - for x in grad_to_none: - x.grad = None - # we clear the L2 cache before each run - runtime.driver.active.clear_cache(cache) - # record time of `fn` - start_event[i].record() - fn() - end_event[i].record() - # Record clocks - di.synchronize() - times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] - return _summarize_statistics(times, quantiles, return_mode) \ No newline at end of file diff --git a/src/timing.py b/src/timing.py index 007d75c6..c10b1c58 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,19 +5,43 @@ import warnings from typing import Any import os -from triton import runtime -from eval import clear_l2_cache +from triton import runtime as triton_runtime +from triton import testing as triton_testing ################################################################################ # Performance Eval ################################################################################ -""" -Kernel Timing Functions [Revamp WIP] -TODO: see our detailed study on how to time kernel execution and benchmarking guide -we implement a few ways to do timing studies -These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) -""" +def clear_l2_cache(device: str = "cuda"): + """ + Clear L2 Cache line by thrashing + From GPU mode reference kernel repo: + https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 + + We can improve this + TODO; should prob check device_name + """ + # don't reserve space for persisting lines + # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + + # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache + # 32 * 1024 * 1024 * 8B = 256MB + # NOTE: we can make this more adaptive based on device + # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) + # write to tenosr with inplace fill + dummy.fill_(42) + del dummy + +def clear_l2_cache_triton(cache=None, device: str = "cuda"): + 
""" + Thrash the cache by making a large dummy tensor, using triton runtime's functionality + """ + with torch.cuda.device(device): + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + triton_runtime.driver.active.clear_cache(cache) + + def get_timing_function( method: str = "cuda_event", # by default ) -> callable: @@ -30,44 +54,155 @@ def get_timing_function( match method: case "cuda_event": return time_execution_with_cuda_event - case "do_bench": - return time_execution_with_do_bench - case "time_time": - return time_execution_with_time_dot_time # this is just for education purpose, don't use this + case "do_bench_interface": + return time_execution_with_do_bench_interface + case "do_bench_impl": + return time_execution_with_do_bench_impl + case "cpu_time": + return time_execution_with_cpu_time + # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") -def time_execution_with_do_bench( +""" +Kernel Timing Functions [Revamp WIP] +TODO: see our detailed study on how to time kernel execution and benchmarking guide +we implement a few ways to do timing studies +These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +""" + + +def time_execution_with_cuda_event( kernel_fn: callable, args: list[Any], num_warmup: int = 3, num_trials: int = 10, + discard_first: int = 1, + verbose: bool = True, + device: torch.device = None, +) -> list[float]: + """ + Time a CUDA kernel function over multiple trials using torch.cuda.Event + The first version of KenrelBench used this for evaluation. + We care about cold cache performance here. + + Args: + kernel_fn: Function to time + *args: Arguments to pass to kernel_fn + num_trials: Number of timing trials to run + verbose: Whether to print per-trial timing info + device: CUDA device to use, if None, use current device + + TODO: double check this with team + Returns: + List of elapsed times in milliseconds + """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) + + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() + + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + + elapsed_times: list[float] = [] # in ms + + # Timing trials + for trial in range(num_trials + discard_first): + torch.cuda.synchronize(device=device) # block on all streams + + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + clear_l2_cache() # measuring cold cache performance + + # note cuda events mark event on current stream + start_event.record() + _ = kernel_fn(*args) + end_event.record() + + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + 
elapsed_times.append(elapsed_time_ms) + + return elapsed_times + + +def time_execution_with_do_bench_interface( + kernel_fn: callable, + args: list[Any], + # this is different for triton do_bench + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used yet verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - TODO: need check do_bench - [WIP] need to check + Just using triton's do_bench as it is """ - device = torch.cuda.current_device() if device is not None else device + do_bench_fn = lambda : kernel_fn(*args) + return triton_testing.do_bench(fn=do_bench_fn, + warmup=25, + rep=100, + grad_to_none=None, + quantiles=None, + return_mode="all") + - if verbose: print("Using do_bench to evaluate kernel") +def time_execution_with_do_bench_impl( + kernel_fn: callable, + args: list[Any], + num_warmup: int = 3, + num_trials: int = 10, + discard_first: int = 1, # not used yet + verbose: bool = True, + device: torch.device | None = None) -> list[float]: + """ + This is modifying the triton do_bench codebase + See Triton's implementation for more details + https://github.com/triton-lang/triton/blob/9073370d5979218d1afa44ec895bbd80e7419a8c/python/triton/testing.py#L127 + """ + + device = torch.cuda.current_device() if device is not None else device + if verbose: + print(f"Using do_bench to evaluate kernel on {device}") - # note: for both nvidia and amd, di is torch.cuda (amd uses a cuda compatible interface), so we could really just have torch.cuda - di = runtime.driver.active.get_device_interface() + # speicfy device interface (supports both nvidia and amd) + # under the hood, di is torch.cuda (amd uses a cuda compatible interface) + di = triton_runtime.driver.active.get_device_interface() kernel_fn(*args) di.synchronize(device=device) - cache = runtime.driver.active.get_empty_cache_for_benchmark() - - # Estimate the runtime of the function (not needed since now the warmup and repeat steps are set by the user) + # clear l2 cache + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + # do_bench Estimate the runtime of the function + # Here we are not using it not needed since now the warmup and repeat steps are set by the user) # start_event = di.Event(enable_timing=True) # end_event = di.Event(enable_timing=True) # start_event.record() # for _ in range(5): - # runtime.driver.active.clear_cache(cache) + # triton_runtime.driver.active.clear_cache(cache) # kernel_fn(*args) # end_event.record() # di.synchronize() @@ -87,8 +222,7 @@ def time_execution_with_do_bench( kernel_fn(*args) # Benchmark for i in range(num_trials): - - # All our functions are forward passes, so we don't need to reset gradients + # All KernelBench functions are forward passes, so we don't need to reset gradients # we don't want `fn` to accumulate gradient values # if it contains a backward pass. 
So we clear the # provided gradients @@ -97,7 +231,7 @@ def time_execution_with_do_bench( # x.grad = None # we clear the L2 cache before each run - runtime.driver.active.clear_cache(cache) + triton_runtime.driver.active.clear_cache(cache) # record time of `fn` start_event[i].record() kernel_fn(*args) @@ -109,16 +243,17 @@ def time_execution_with_do_bench( return times -def time_execution_with_time_dot_time( +def time_execution_with_cpu_time( kernel_fn: callable, args: list[Any], num_warmup: int = 3, num_trials: int = 10, + discard_first: int = 1, verbose: bool = True, device: torch.device | None = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using time.time() + Time a CUDA kernel function over multiple trials using CPU side timing [WIP] Args: kernel_fn: Function to time @@ -132,13 +267,6 @@ def time_execution_with_time_dot_time( Not recommended: """ - - # give warning that this is not the way to do it - warnings.warn( - "time_execution_with_time_dot_time is meant for educational purposes only, please other options like time_with_cuda_event or time_with_do_bench", - UserWarning, - ) - if device is None: if verbose: print(f"Using current device: {torch.cuda.current_device()}") @@ -152,95 +280,35 @@ def time_execution_with_time_dot_time( print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}") elapsed_times = [] + # clear PyTorch allocator cache + torch.cuda.empty_cache() + # Actual trials - for trial in range(num_trials): - start_time = time.time() - kernel_fn(*args) + for trial in range(num_trials + discard_first): + # block all streams on device torch.cuda.synchronize(device=device) - end_time = time.time() - # Calculate the elapsed time in milliseconds - elapsed_time_ms = (end_time - start_time) * 1000 - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) - - return elapsed_times + # focus on cold_cache performance + clear_l2_cache() - - - -def time_execution_with_cuda_event( - kernel_fn: callable, - args: list[Any], - num_warmup: int = 3, - num_trials: int = 10, - verbose: bool = True, - device: torch.device = None, -) -> list[float]: - """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - The first version of KenrelBench used this for evaluation. - We care about cold cache performance here. 
- - Args: - kernel_fn: Function to time - *args: Arguments to pass to kernel_fn - num_trials: Number of timing trials to run - verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device - - TODO: double check this with team - Returns: - List of elapsed times in milliseconds - """ - if device is None: - if verbose: - print(f"Using current device: {torch.cuda.current_device()}") - device = torch.cuda.current_device() - - # Warm ups - for _ in range(num_warmup): + # CPU-side wall clock time using perf_counter (high-resolution timer) + start_time = time.perf_counter() kernel_fn(*args) - torch.cuda.synchronize(device=device) - - # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache - torch.cuda.empty_cache() - - print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - - elapsed_times: list[float] = [] # in ms - - # Timing trials - for trial in range(num_trials): - torch.cuda.synchronize(device=device) # block on all streams - - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - clear_l2_cache() # measuring cold cache performance - - # note cuda events mark event on current stream - start_event.record() - _ = kernel_fn(*args) - end_event.record() - - # waits for all streams on that device - # though it is important to note the events only record time between on current stream - # TODO: find ways to check hacks by launching work on additional stream - torch.cuda.synchronize(device=device) + torch.cuda.synchronize(device=device) # wait for all stream to finish + # this blocks the CPU until all GPU work on device is done + # this means all kernels on all streams + end_time = time.perf_counter() # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Timing Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + elapsed_time_ms = (end_time - start_time) * 1000 + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times - ######################################################## # Timing stats ######################################################### diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 955b2519..24d0faf2 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -59,13 +59,21 @@ def matmul_kernel(a, b): # Validate results assert isinstance(elapsed_times, list), "Expected list of elapsed times" - assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" + # assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" - print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") + # print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") + stats = timing.get_timing_stats(elapsed_times, device=device) + print("Timing stats") + print(stats) + + +timing_methods = ["cuda_event", 
"cpu_time", "do_bench_interface", "do_bench_impl"] + +for timing_method in timing_methods: + _run_timing_smoke_test_matmul(timing_method) -_run_timing_smoke_test_matmul("cuda_event") def test_do_bench_simple_smoke(): From 920a7932939eb3e189572d85cfdea08a2ea4aee4 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 04:44:47 +0000 Subject: [PATCH 05/13] some annotations --- .gitignore | Bin 171 -> 152 bytes src/eval.py | 261 ++++++++++++++++++++------------------------------ src/timing.py | 36 ++++--- 3 files changed, 126 insertions(+), 171 deletions(-) diff --git a/.gitignore b/.gitignore index abb125d6aabe372688772976daa7ee024353eaf8..3956bf85591170be4f1159744411e5afa85d36a3 100644 GIT binary patch delta 22 dcmZ3@ID>IQCx3iWX=YAJd~#xPMyft97XV!Z2X+7e delta 41 vcmbQixSDZ7r&1zADMJcFCPN8BJVOpcDnl6%=P(#E=rI&9R5I`~a4`S?*LnzW diff --git a/src/eval.py b/src/eval.py index 4a072c89..2573da9c 100644 --- a/src/eval.py +++ b/src/eval.py @@ -15,13 +15,14 @@ from io import StringIO from typing import Union, Optional -import numpy as np -import requests import torch import torch.nn as nn from pydantic import BaseModel +from triton import runtime -from . import utils + +# import cupy as cp +import utils, timing REPO_TOP_PATH = os.path.abspath( os.path.join( @@ -46,6 +47,7 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if isinstance(problem_id, str): problem_id = int(problem_id) + # TODO: replace dataset object @Omar problem_path = problems[problem_id] # problem_path = os.path.join(REPO_ROOT_PATH, problem) @@ -58,7 +60,6 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str else: return (problem_path, ref_arch) - def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False): PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR) @@ -70,6 +71,36 @@ def set_seed(seed: int): # NOTE: this only sets on current cuda device torch.cuda.manual_seed(seed) + +def clear_l2_cache(device: str = "cuda"): + """ + Clear L2 Cache line by thrashing + From GPU mode reference kernel repo: + https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 + + We can improve this + TODO; should prob check device_name + """ + # don't reserve space for persisting lines + # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) + + # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache + # 32 * 1024 * 1024 * 8B = 256MB + # NOTE: we can make this more adaptive based on device + # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 + dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) + # write to tenosr with inplace fill + dummy.fill_(42) + del dummy + +def clear_l2_cache_triton(cache, device): + # cp.cuda.runtime.cudaDeviceSetLimit( + # cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0 + # ) + cache = runtime.driver.active.get_empty_cache_for_benchmark() + runtime.driver.active.clear_cache(cache) + + def get_torch_dtype_from_string(precision: str) -> torch.dtype: """ Get the torch dtype for specific precision @@ -107,9 +138,8 @@ def get_tolerance_for_precision(precision: str | torch.dtype) -> float: class KernelExecResult(BaseModel): """ - Single Kernel Execution + Single Kernel Execution - all the 
information it needs """ - compiled: bool = False correctness: bool = False metadata: dict = {} @@ -387,6 +417,57 @@ def _process_input_tensor(input, device, backend="cuda", precision=torch.float32 return input_tensor.to(device=device) +def load_kernel( + verbose: str, + backend: str, + custom_model_src, + context, + build_dir, + device, + metadata: dict, + ) -> tuple[Union[nn.Module, KernelExecResult, None], Optional[tempfile.NamedTemporaryFile]]: + '''KernelExecResult means that loading the kernel failed (either because of compilation or something else), ModelNew that we succesfully loaded ModelNew''' + if verbose: + print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") + + try: + os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion + tempfile = None + + if backend.lower() in ["triton", "tilelang", "cute"]: + # Use tempfile approach for triton, tilelang, and cute + # These DSLs require proper module import for JIT decorators to work + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) + else: + # Default CUDA backend + ModelNew = load_custom_model(custom_model_src, context, build_dir) + torch.cuda.synchronize(device=device) # not sure if this is too much + except Exception as e: + print( + f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" + ) + # TODO: add metadata for compilation error (how to we get the compilation error message?) + + if "lock" in str(e) or "No such file or directory" in str(e): + # this is a lock file error, likely due to concurrent compilation + # this does not necessarily mean the compilation failed, but we should retry + print( + f"[Eval] Lock file error during compilation, Please retry. Error: {e}" + ) + graceful_eval_cleanup(context, device, tempfile) + return None, None + else: + metadata["compilation_error_name"] = get_error_name(e) + metadata["compilation_error"] = str(e) + graceful_eval_cleanup(context, device, tempfile) + return KernelExecResult( + compiled=False, metadata=metadata + ), None + return ModelNew, tempfile + + def eval_kernel_against_ref( original_model_src: str, custom_model_src: str, @@ -470,50 +551,15 @@ def eval_kernel_against_ref( assert hasattr(original_model, "forward") if verbose: print("[Eval] Original Model Loaded") - - if verbose: - print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") - - # this is where compilation happens - try: - os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion - tempfile = None - # add hash for later to distinguish between multi-turn kernels - - backend_lower = backend.lower() - if backend_lower in ["triton", "tilelang", "cute"]: - # Use tempfile approach for triton, tilelang, and cute - # These DSLs require proper module import for JIT decorators to work - ModelNew, tempfile = load_custom_model_with_tempfile( - custom_model_src, entry_point="ModelNew" - ) - else: - # Default CUDA backend - ModelNew = load_custom_model(custom_model_src, context, build_dir) - torch.cuda.synchronize(device=device) # not sure if this is too much - except Exception as e: - print( - f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" - ) - # TODO: add metadata for compilation error (how to we get the compilation error message?) 
- - if "lock" in str(e) or "No such file or directory" in str(e): - # this is a lock file error, likely due to concurrent compilation - # this does not necessarily mean the compilation failed, but we should retry - print( - f"[Eval] Lock file error during compilation, Please retry. Error: {e}" - ) - graceful_eval_cleanup(context, device, tempfile) - return None - else: - metadata["compilation_error_name"] = get_error_name(e) - metadata["compilation_error"] = e - graceful_eval_cleanup(context, device, tempfile) - return KernelExecResult( - compiled=False, metadata=metadata - ) # skip further steps - - # at this point we passed compilation + result, tempfile = load_kernel( + verbose, backend, custom_model_src, context, build_dir, device, metadata + ) + if isinstance(result, KernelExecResult): + return result # loading the kernel failed, return the exec result + if result is None: + # lockfile / concurrent compilation: retryable failure + return None + ModelNew = result # we passed loading try: with torch.no_grad(): set_seed(seed_num) # set seed for reproducible weights @@ -536,7 +582,7 @@ def eval_kernel_against_ref( compiled=True, correctness=False, metadata=metadata ) # skip further steps - kernel_exec_result = None + # kernel_exec_result = None # Check Correctness if verbose: @@ -578,14 +624,16 @@ def eval_kernel_against_ref( model_new = custom_model.to(device=device, dtype=precision) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( + # TODO: replace functions from timing based on we choose + # we should pass in which timing method you wanna do + elapsed_times = timing.time_execution_with_cuda_event( model_new, - *inputs, + inputs, num_trials=num_perf_trials, verbose=verbose, device=device, ) - runtime_stats = get_timing_stats(elapsed_times, device=device) + runtime_stats = timing.get_timing_stats(elapsed_times, device=device) if verbose: print(f"[Eval] Performance Stats: {runtime_stats}") @@ -625,63 +673,6 @@ def register_and_format_exception( return metadata -def time_execution_with_cuda_event( - kernel_fn: callable, - *args, - num_warmup: int = 3, - num_trials: int = 10, - verbose: bool = True, - device: torch.device = None, -) -> list[float]: - """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - - Args: - kernel_fn: Function to time - *args: Arguments to pass to kernel_fn - num_trials: Number of timing trials to run - verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device - - Returns: - List of elapsed times in milliseconds - """ - if device is None: - if verbose: - print(f"Using current device: {torch.cuda.current_device()}") - device = torch.cuda.current_device() - - # Warm ups - for _ in range(num_warmup): - kernel_fn(*args) - torch.cuda.synchronize(device=device) - - print( - f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - elapsed_times = [] - - # Actual trials - for trial in range(num_trials): - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) - - start_event.record() - kernel_fn(*args) - end_event.record() - - # Synchronize to ensure the events have completed - torch.cuda.synchronize(device=device) - - # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - 
elapsed_times.append(elapsed_time_ms) - - return elapsed_times - def run_and_check_correctness( original_model_instance: nn.Module, @@ -865,54 +856,6 @@ def convert_to_serializable(obj): return converted_metadata -################################################################################ -# Performance Eval -################################################################################ - - -def fetch_baseline_time( - level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str -) -> dict: - """ - Fetch the baseline time from the time - """ - if not os.path.exists(baseline_time_filepath): - raise FileNotFoundError( - f"Baseline time file not found at {baseline_time_filepath}" - ) - - with open(baseline_time_filepath, "r") as f: - baseline_json = json.load(f) - - problem_name = dataset[problem_id].split("/")[-1] - baseline_time = baseline_json[level_name].get(problem_name, None) - return baseline_time - - -def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> dict: - """Get timing statistics from a list of elapsed times. - - Args: - elapsed_times: List of elapsed times in milliseconds - device: CUDA device, record device info - Returns: - Dict containing mean, std, min, max and num_trials - all timing are in ms - """ - - stats = { - "mean": float(f"{np.mean(elapsed_times):.3g}"), - "std": float(f"{np.std(elapsed_times):.3g}"), - "min": float(f"{np.min(elapsed_times):.3g}"), - "max": float(f"{np.max(elapsed_times):.3g}"), - "num_trials": len(elapsed_times), - } - - if device: - stats["hardware"] = torch.cuda.get_device_name(device=device) - stats["device"] = str(device) # for debugging - - return stats # if __name__ == "__main__": diff --git a/src/timing.py b/src/timing.py index c10b1c58..56cefd09 100644 --- a/src/timing.py +++ b/src/timing.py @@ -2,9 +2,10 @@ import json import numpy as np import time -import warnings from typing import Any import os + +# we leverage triton's testing functionality for some timing methods from triton import runtime as triton_runtime from triton import testing as triton_testing @@ -17,9 +18,6 @@ def clear_l2_cache(device: str = "cuda"): Clear L2 Cache line by thrashing From GPU mode reference kernel repo: https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 - - We can improve this - TODO; should prob check device_name """ # don't reserve space for persisting lines # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) @@ -39,6 +37,7 @@ def clear_l2_cache_triton(cache=None, device: str = "cuda"): """ with torch.cuda.device(device): cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + # this effectively thrashes L2 cache under the hood too triton_runtime.driver.active.clear_cache(cache) @@ -58,8 +57,8 @@ def get_timing_function( return time_execution_with_do_bench_interface case "do_bench_impl": return time_execution_with_do_bench_impl - case "cpu_time": - return time_execution_with_cpu_time + case "host_time": + return time_execution_with_host_time # we might add other methods in the future case _: raise ValueError(f"Unsupported timing method: {method}") @@ -156,10 +155,12 @@ def time_execution_with_do_bench_interface( verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - Just using triton's do_bench as it is + Using triton's default do_bench as it is + Note we don't set num_warmup and num_trials, and we use 
warmup 25 ms and repetition time 100 ms with Triton's default values + See doc: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html + Benchmark the runtime of the provided function. By default, return the median runtime of fn along with the 20-th and 80-th performance percentile. """ - - do_bench_fn = lambda : kernel_fn(*args) + do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments return triton_testing.do_bench(fn=do_bench_fn, warmup=25, rep=100, @@ -180,6 +181,10 @@ def time_execution_with_do_bench_impl( This is modifying the triton do_bench codebase See Triton's implementation for more details https://github.com/triton-lang/triton/blob/9073370d5979218d1afa44ec895bbd80e7419a8c/python/triton/testing.py#L127 + + Note we duplicate triton's implementation and modify / comment out parts + to use num_warmup and num_trials that explicitly follows what user define here + instead of do_bench's version that computes how many times to run warmup and profile based on total warmup and repetition time """ device = torch.cuda.current_device() if device is not None else device @@ -243,7 +248,7 @@ def time_execution_with_do_bench_impl( return times -def time_execution_with_cpu_time( +def time_execution_with_host_time( kernel_fn: callable, args: list[Any], num_warmup: int = 3, @@ -253,8 +258,12 @@ def time_execution_with_cpu_time( device: torch.device | None = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using CPU side timing - [WIP] + Time a CUDA kernel function over multiple trials using Host (CPU) side timing + + This measures host-side wall clock time, E2E latency observed by host + Note that could take including Python overhead, CUDA launch/runtime costs, synchronization, all GPU work across all streams, and host OS overhaed + Hence results might be longer than device-side (CUDA event) timings + Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -311,12 +320,15 @@ def time_execution_with_cpu_time( ######################################################## # Timing stats +# tools to help compute speedup and other time ######################################################### def fetch_baseline_time( level_name: str, problem_id: int, dataset: list[str], baseline_time_filepath: str ) -> dict: """ Fetch the baseline time from the time + + Note: might be better to just run the refernece using torch eager and compile sometimes """ if not os.path.exists(baseline_time_filepath): raise FileNotFoundError( From 05d408faec732ea05d1c78875e2d1f104588bf60 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 05:02:36 +0000 Subject: [PATCH 06/13] run_and_check compatible --- scripts/generate_baseline_time.py | 14 ++++++++++---- src/eval.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/scripts/generate_baseline_time.py b/scripts/generate_baseline_time.py index 5a68ea08..0a1f608b 100644 --- a/scripts/generate_baseline_time.py +++ b/scripts/generate_baseline_time.py @@ -2,11 +2,13 @@ import numpy as np from src.eval import ( load_original_model_and_inputs, - time_execution_with_cuda_event, - get_timing_stats, set_seed, fetch_ref_arch_from_problem_id, ) +from src.timing import ( + get_timing_function, + get_timing_stats, +) from src.dataset import construct_problem_dataset_from_problem_dir from src.utils import read_file import os @@ -81,6 +83,7 @@ def measure_program_time( torch_compile_options: str="default", device: torch.device="cuda:0", verbose: bool = False, + timing_method: str = 
"cuda_event", ) -> dict: """ Measure the time of a KernelBench reference architecture @@ -116,8 +119,11 @@ def measure_program_time( model = model.cuda(device=device) torch.cuda.synchronize(device=device) - elapsed_times = time_execution_with_cuda_event( - model, *inputs, num_trials=num_trials, verbose=verbose, device=device + + # run chosen timing function + timing_fn = get_timing_function(timing_method) + elapsed_times = timing_fn( + model, inputs, num_trials=num_trials, verbose=verbose, device=device ) runtime_stats = get_timing_stats(elapsed_times, device=device) diff --git a/src/eval.py b/src/eval.py index 2573da9c..e117d7dc 100644 --- a/src/eval.py +++ b/src/eval.py @@ -22,7 +22,7 @@ # import cupy as cp -import utils, timing +from . import utils, timing REPO_TOP_PATH = os.path.abspath( os.path.join( From 2be968a5a1a5165af1e007eadbae59f70326fd31 Mon Sep 17 00:00:00 2001 From: Pietro Date: Fri, 12 Dec 2025 05:10:13 +0000 Subject: [PATCH 07/13] revert eval and add only necessary changes --- src/eval.py | 154 ++++++++++++++++++---------------------------------- 1 file changed, 53 insertions(+), 101 deletions(-) diff --git a/src/eval.py b/src/eval.py index e117d7dc..f157a6b7 100644 --- a/src/eval.py +++ b/src/eval.py @@ -15,13 +15,12 @@ from io import StringIO from typing import Union, Optional +import numpy as np +import requests import torch import torch.nn as nn from pydantic import BaseModel -from triton import runtime - -# import cupy as cp from . import utils, timing REPO_TOP_PATH = os.path.abspath( @@ -47,7 +46,6 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str if isinstance(problem_id, str): problem_id = int(problem_id) - # TODO: replace dataset object @Omar problem_path = problems[problem_id] # problem_path = os.path.join(REPO_ROOT_PATH, problem) @@ -60,6 +58,7 @@ def fetch_ref_arch_from_problem_id(problem_id, problems, with_name=False) -> str else: return (problem_path, ref_arch) + def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False): PROBLEM_DIR = os.path.join(KERNEL_BENCH_PATH, "level" + str(level)) dataset = utils.construct_problem_dataset_from_problem_dir(PROBLEM_DIR) @@ -71,36 +70,6 @@ def set_seed(seed: int): # NOTE: this only sets on current cuda device torch.cuda.manual_seed(seed) - -def clear_l2_cache(device: str = "cuda"): - """ - Clear L2 Cache line by thrashing - From GPU mode reference kernel repo: - https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 - - We can improve this - TODO; should prob check device_name - """ - # don't reserve space for persisting lines - # cp.cuda.runtime.cudaDeviceSetLimit(cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0) - - # Thrash L2 cache by creating a larger dummy tensor, effectively flushing the cache - # 32 * 1024 * 1024 * 8B = 256MB - # NOTE: we can make this more adaptive based on device - # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 - dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) - # write to tenosr with inplace fill - dummy.fill_(42) - del dummy - -def clear_l2_cache_triton(cache, device): - # cp.cuda.runtime.cudaDeviceSetLimit( - # cp.cuda.runtime.cudaLimitPersistingL2CacheSize, 0 - # ) - cache = runtime.driver.active.get_empty_cache_for_benchmark() - runtime.driver.active.clear_cache(cache) - - def get_torch_dtype_from_string(precision: 
str) -> torch.dtype: """ Get the torch dtype for specific precision @@ -138,8 +107,9 @@ def get_tolerance_for_precision(precision: str | torch.dtype) -> float: class KernelExecResult(BaseModel): """ - Single Kernel Execution - all the information it needs + Single Kernel Execution """ + compiled: bool = False correctness: bool = False metadata: dict = {} @@ -417,57 +387,6 @@ def _process_input_tensor(input, device, backend="cuda", precision=torch.float32 return input_tensor.to(device=device) -def load_kernel( - verbose: str, - backend: str, - custom_model_src, - context, - build_dir, - device, - metadata: dict, - ) -> tuple[Union[nn.Module, KernelExecResult, None], Optional[tempfile.NamedTemporaryFile]]: - '''KernelExecResult means that loading the kernel failed (either because of compilation or something else), ModelNew that we succesfully loaded ModelNew''' - if verbose: - print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") - - try: - os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion - tempfile = None - - if backend.lower() in ["triton", "tilelang", "cute"]: - # Use tempfile approach for triton, tilelang, and cute - # These DSLs require proper module import for JIT decorators to work - ModelNew, tempfile = load_custom_model_with_tempfile( - custom_model_src, entry_point="ModelNew" - ) - else: - # Default CUDA backend - ModelNew = load_custom_model(custom_model_src, context, build_dir) - torch.cuda.synchronize(device=device) # not sure if this is too much - except Exception as e: - print( - f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" - ) - # TODO: add metadata for compilation error (how to we get the compilation error message?) - - if "lock" in str(e) or "No such file or directory" in str(e): - # this is a lock file error, likely due to concurrent compilation - # this does not necessarily mean the compilation failed, but we should retry - print( - f"[Eval] Lock file error during compilation, Please retry. 
Error: {e}" - ) - graceful_eval_cleanup(context, device, tempfile) - return None, None - else: - metadata["compilation_error_name"] = get_error_name(e) - metadata["compilation_error"] = str(e) - graceful_eval_cleanup(context, device, tempfile) - return KernelExecResult( - compiled=False, metadata=metadata - ), None - return ModelNew, tempfile - - def eval_kernel_against_ref( original_model_src: str, custom_model_src: str, @@ -551,15 +470,50 @@ def eval_kernel_against_ref( assert hasattr(original_model, "forward") if verbose: print("[Eval] Original Model Loaded") - result, tempfile = load_kernel( - verbose, backend, custom_model_src, context, build_dir, device, metadata - ) - if isinstance(result, KernelExecResult): - return result # loading the kernel failed, return the exec result - if result is None: - # lockfile / concurrent compilation: retryable failure - return None - ModelNew = result # we passed loading + + if verbose: + print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") + + # this is where compilation happens + try: + os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion + tempfile = None + # add hash for later to distinguish between multi-turn kernels + + backend_lower = backend.lower() + if backend_lower in ["triton", "tilelang", "cute"]: + # Use tempfile approach for triton, tilelang, and cute + # These DSLs require proper module import for JIT decorators to work + ModelNew, tempfile = load_custom_model_with_tempfile( + custom_model_src, entry_point="ModelNew" + ) + else: + # Default CUDA backend + ModelNew = load_custom_model(custom_model_src, context, build_dir) + torch.cuda.synchronize(device=device) # not sure if this is too much + except Exception as e: + print( + f"Failed to compile custom CUDA kernel: Record as compilation failure. \nError: {e}" + ) + # TODO: add metadata for compilation error (how to we get the compilation error message?) + + if "lock" in str(e) or "No such file or directory" in str(e): + # this is a lock file error, likely due to concurrent compilation + # this does not necessarily mean the compilation failed, but we should retry + print( + f"[Eval] Lock file error during compilation, Please retry. 
Error: {e}" + ) + graceful_eval_cleanup(context, device, tempfile) + return None + else: + metadata["compilation_error_name"] = get_error_name(e) + metadata["compilation_error"] = e + graceful_eval_cleanup(context, device, tempfile) + return KernelExecResult( + compiled=False, metadata=metadata + ) # skip further steps + + # at this point we passed compilation try: with torch.no_grad(): set_seed(seed_num) # set seed for reproducible weights @@ -582,7 +536,7 @@ def eval_kernel_against_ref( compiled=True, correctness=False, metadata=metadata ) # skip further steps - # kernel_exec_result = None + kernel_exec_result = None # Check Correctness if verbose: @@ -624,9 +578,9 @@ def eval_kernel_against_ref( model_new = custom_model.to(device=device, dtype=precision) torch.cuda.synchronize(device=device) - # TODO: replace functions from timing based on we choose - # we should pass in which timing method you wanna do - elapsed_times = timing.time_execution_with_cuda_event( + # support multiple timing backend + timing_fn = timing.get_timing_function("cuda_event") + elapsed_times = timing_fn( model_new, inputs, num_trials=num_perf_trials, @@ -673,7 +627,6 @@ def register_and_format_exception( return metadata - def run_and_check_correctness( original_model_instance: nn.Module, new_model_instance: nn.Module, @@ -857,7 +810,6 @@ def convert_to_serializable(obj): - # if __name__ == "__main__": # fetch_kernel_from_database("kernelbench_prompt_v2_level_2", 1, 1, "http://localhost:9091") # print(fetch_ref_arch_from_level_problem_id("2", 1, with_name=True)) From 936f22149f5918496a980a08ac62bb94f6d07807 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 03:12:42 +0000 Subject: [PATCH 08/13] top_level eval entry point to set timing_method --- scripts/eval_from_generations.py | 6 ++++++ scripts/generate_and_eval_single_sample.py | 2 ++ .../generate_and_eval_single_sample_modal.py | 6 ++++-- scripts/run_and_check.py | 20 +++++++++++++++---- src/eval.py | 9 +++++++-- 5 files changed, 35 insertions(+), 8 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 2e39e3be..b28a3be0 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -113,6 +113,7 @@ def __init__(self): self.num_perf_trials = 100 self.timeout = 180 # in seconds self.measure_performance = True + self.timing_method = "cuda_event" # Eval Flow setting # To speedup evaluation, you can start building the kernel on CPU on disk as cache @@ -173,6 +174,7 @@ def evaluate_single_sample_modal( num_correct_trials: int = 5, num_perf_trials: int = 100, measure_performance: bool = True, + timing_method: str = "cuda_event", verbose: bool = False, backend: str = "cuda", precision: str = "fp32", @@ -212,6 +214,7 @@ def evaluate_single_sample_modal( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, @@ -324,6 +327,7 @@ def evaluate_single_sample( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=configs.measure_performance, + timing_method=configs.timing_method, verbose=configs.verbose, num_correct_trials=configs.num_correct_trials, num_perf_trials=configs.num_perf_trials, @@ -384,6 +388,7 @@ def evaluate_single_sample_modal_direct( num_correct_trials=configs.num_correct_trials, num_perf_trials=configs.num_perf_trials, measure_performance=configs.measure_performance, + 
timing_method=configs.timing_method, verbose=configs.verbose, ) return eval_result @@ -502,6 +507,7 @@ def batch_eval_modal( num_correct_trials=config.num_correct_trials, num_perf_trials=config.num_perf_trials, measure_performance=config.measure_performance, + timing_method=config.timing_method, verbose=config.verbose, backend=config.backend, precision=config.precision, diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 2b2d5301..2e110932 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -73,6 +73,7 @@ def __init__(self): self.log_eval_result = False self.backend = "cuda" + self.timing_method = "cuda_event" # see timing.py # Prompt construction self.prompt_option = "one_shot" # choices: zero_shot, one_shot, few_shot @@ -267,6 +268,7 @@ def main(config: EvalConfig): custom_kernel, verbose=config.verbose, measure_performance=True, + timing_method=config.timing_method, num_correct_trials=5, num_perf_trials=100, backend=config.backend, diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 7628e0bf..9dee518a 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -75,6 +75,7 @@ def __init__(self): self.log_eval_result = False self.backend = "cuda" + self.timing_method = "cuda_event" # see timing.py # Prompt generation settings self.prompt_option = "one_shot" # zero_shot, one_shot, few_shot self.include_hardware_info = False @@ -110,7 +111,7 @@ def __repr__(self): class EvalFunc: @modal.method() - def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision): + def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, timing_method): # 3. Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation @@ -121,6 +122,7 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc modal_set_gpu_arch(gpu_arch) return eval_kernel_against_ref( ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True, + timing_method=timing_method, num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision) ) @@ -274,7 +276,7 @@ def main(config: EvalConfig): with app.run(): kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote( - ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision + ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, config.timing_method ) print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 316b96ee..e0492938 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -57,6 +57,8 @@ Usage: 1. PyTorch reference is a local file (local eval) python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local +python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/few_shot/model_ex_tiled_matmul.py kernel_src_path=src/prompts/few_shot/model_new_ex_tiled_matmul.py eval_mode=local + 2. 
PyTorch reference is a kernelbench problem (local eval) python3 scripts/run_and_check.py ref_origin=kernelbench level= problem_id= kernel_src_path= eval_mode=local @@ -101,6 +103,7 @@ def __init__(self): # verbose logging self.verbose = False self.measure_performance = True + self.timing_method = "cuda_event" # see timing.py self.build_dir_prefix = "" # if you want to specify a custom build directory self.clear_cache = False # TODO @@ -128,18 +131,23 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict num_perf_trials = configs["num_perf_trials"] verbose = configs["verbose"] measure_performance = configs["measure_performance"] + timing_method = configs["timing_method"] + backend = configs["backend"] + precision = kernel_eval.get_torch_dtype_from_string(configs["precision"]) + try: eval_result = kernel_eval.eval_kernel_against_ref( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, build_dir=build_dir, device=device, - backend=configs["backend"], - precision=kernel_eval.get_torch_dtype_from_string(configs["precision"]) + backend=backend, + precision=precision ) return eval_result except Exception as e: @@ -180,17 +188,21 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c num_perf_trials = configs["num_perf_trials"] verbose = configs["verbose"] measure_performance = configs["measure_performance"] + timing_method = configs["timing_method"] + backend = configs["backend"] + precision = kernel_eval.get_torch_dtype_from_string(configs["precision"]) eval_result = eval_kernel_against_ref( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=measure_performance, + timing_method=timing_method, verbose=verbose, num_correct_trials=num_correct_trials, num_perf_trials=num_perf_trials, device=device, - backend=configs["backend"], - precision=get_torch_dtype_from_string(configs["precision"]) + backend=backend, + precision=precision ) return eval_result diff --git a/src/eval.py b/src/eval.py index f157a6b7..5f1fe8d8 100644 --- a/src/eval.py +++ b/src/eval.py @@ -393,8 +393,9 @@ def eval_kernel_against_ref( seed_num: int = 42, num_correct_trials: int = 1, num_perf_trials: int = 10, - verbose: bool = False, measure_performance: bool = False, + timing_method: str = "cuda_event", # see timing.py + verbose: bool = False, build_dir: os.PathLike = None, device: Union[torch.device, int] = ( torch.cuda.current_device() if torch.cuda.is_available() else None @@ -405,11 +406,15 @@ def eval_kernel_against_ref( """ Evaluate the custom kernel against the original model + NOTE: we are thinking about refactor this to be more modularized + and we can add more checks as our other ongiong PRs are working on + num_correct_trials: number of trials to initialize different random inputs; correctness pass only if all trials pass num_perf_trials: run the evalutation many times to take the average device: GPU (cuda) device to run the evalutation on backend: str, one of 'cuda', 'triton', 'tilelang', or 'cute' precision: torch.dtype for computation (note: tilelang only supports fp16) + timing_method: str, method to time kernel, see timing.py for more details """ # TODO: check device is busy assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" @@ -579,7 +584,7 @@ def eval_kernel_against_ref( torch.cuda.synchronize(device=device) # support multiple timing 
backend - timing_fn = timing.get_timing_function("cuda_event") + timing_fn = timing.get_timing_function(timing_method) elapsed_times = timing_fn( model_new, inputs, From 2c36572c71744a090c40194bf24d0234ded3eb68 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 03:47:15 +0000 Subject: [PATCH 09/13] remove discard_first for cuda event and updated documentation --- src/timing.py | 121 ++++++++++++++++++++--------- src/unit_tests/test_eval_timing.py | 20 +++-- 2 files changed, 98 insertions(+), 43 deletions(-) diff --git a/src/timing.py b/src/timing.py index 56cefd09..d07dd54f 100644 --- a/src/timing.py +++ b/src/timing.py @@ -5,18 +5,23 @@ from typing import Any import os + # we leverage triton's testing functionality for some timing methods from triton import runtime as triton_runtime from triton import testing as triton_testing ################################################################################ -# Performance Eval +# timing.py +# Various timing methods and utilities for performance evaluation +# please make a PR if you have suggestions! + +# Try them out at src/unit_tests/test_eval_timing.py ################################################################################ def clear_l2_cache(device: str = "cuda"): """ - Clear L2 Cache line by thrashing - From GPU mode reference kernel repo: + Clear L2 Cache line by thrashing with a large tensor + Acknowledge GPU mode reference kernel repo: https://github.com/gpu-mode/reference-kernels/commit/7c15075a39286e88939d99d3f3a60be88b8e6223#diff-3a30a71cbf8db2badd224f4d92f9a2546925a5b522632a31d353526b7a5f3338R158-R163 """ # don't reserve space for persisting lines @@ -27,7 +32,7 @@ def clear_l2_cache(device: str = "cuda"): # NOTE: we can make this more adaptive based on device # L2 cache sizes: A100=40MB, H100=50MB, H200=90MB, RTX4090=72MB, L40S=48MB, Blackwell≈192MB → overwrite >200MB to fully thrash L2 dummy = torch.empty((32, 1024, 1024), dtype=torch.int64, device=device) - # write to tenosr with inplace fill + # write to tensor with inplace fill dummy.fill_(42) del dummy @@ -45,17 +50,34 @@ def get_timing_function( method: str = "cuda_event", # by default ) -> callable: """ - Get the timing function based on different timing methods + Get timing function by method name. 
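+
+    Example (illustrative sketch; `kernel_fn` and `args` are placeholders for the
+    callable under test and its argument list, not names defined in this patch):
+        timing_fn = get_timing_function("cuda_event")
+        elapsed_ms = timing_fn(kernel_fn, args, num_warmup=3, num_trials=10)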
+ + Available methods: + - "cuda_event": torch.cuda.event timing (default, explicit trial control) + - "do_bench": Use triton's do_bench (adaptive trial count based on time budget) + - "do_bench_impl": Mirrors Triton's do_bench implementation (explicit control) + - "host_time": Host side wall-clock timing (might include overhead) + + Args: + method: Name of timing method to use + + Returns: + Timing function with signature (kernel_fn, args, num_warmup, num_trials, + discard_first, verbose, device) -> list[float] """ print( f"[Profiling] Using timing method: {method}" ) + # NOTE: here are all the timing methods we supporting for now match method: case "cuda_event": return time_execution_with_cuda_event - case "do_bench_interface": + case "do_bench": + # caveat: just using do_bench as it is + # do not have precise control over number of trials return time_execution_with_do_bench_interface case "do_bench_impl": + # do_bench equivalent implementations for transparency and control return time_execution_with_do_bench_impl case "host_time": return time_execution_with_host_time @@ -64,10 +86,8 @@ def get_timing_function( raise ValueError(f"Unsupported timing method: {method}") """ -Kernel Timing Functions [Revamp WIP] -TODO: see our detailed study on how to time kernel execution and benchmarking guide -we implement a few ways to do timing studies -These should be implemnted to be agnostic whether the modules are rather Model (reference kernel) or ModelNew (generated kernel) +Kernel Timing Functions +NOTE: we have a WIP blogpost on this topic covering the various timing approaches """ @@ -76,25 +96,26 @@ def time_execution_with_cuda_event( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, + discard_first: int = 1, # not used verbose: bool = True, device: torch.device = None, ) -> list[float]: """ - Time a CUDA kernel function over multiple trials using torch.cuda.Event - The first version of KenrelBench used this for evaluation. + Time a CUDA kernel function over multiple trials using torch.cuda.event + The first version of KernelBench used this for evaluation. We care about cold cache performance here. 
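+    (Each timing trial below clears the L2 cache via clear_l2_cache() before
+    launching the kernel, so the reported numbers reflect cold-cache latency.)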
Args: kernel_fn: Function to time - *args: Arguments to pass to kernel_fn + args: Arguments to pass to kernel_fn + num_warmup: Number of warmup iterations before timing num_trials: Number of timing trials to run + discard_first: not used verbose: Whether to print per-trial timing info - device: CUDA device to use, if None, use current device + device: CUDA device to use, defaults to current device - TODO: double check this with team Returns: - List of elapsed times in milliseconds + List of elapsed times in milliseconds (length = num_trials) """ if device is None: if verbose: @@ -115,7 +136,7 @@ def time_execution_with_cuda_event( elapsed_times: list[float] = [] # in ms # Timing trials - for trial in range(num_trials + discard_first): + for trial in range(num_trials): torch.cuda.synchronize(device=device) # block on all streams # create event marker default is not interprocess @@ -136,11 +157,9 @@ def time_execution_with_cuda_event( # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) - if trial >= discard_first: - if verbose: - logical_idx = trial - discard_first + 1 - print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + if verbose: + print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times @@ -148,17 +167,34 @@ def time_execution_with_cuda_event( def time_execution_with_do_bench_interface( kernel_fn: callable, args: list[Any], - # this is different for triton do_bench + # Not used, as triton do_bench handles adaptive trials num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used yet + discard_first: int = 1, # not used here verbose: bool = True, device: torch.device | None = None) -> list[float]: """ - Using triton's default do_bench as it is - Note we don't set num_warmup and num_trials, and we use warmup 25 ms and repetition time 100 ms with Triton's default values - See doc: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html - Benchmark the runtime of the provided function. By default, return the median runtime of fn along with the 20-th and 80-th performance percentile. + Wrapper around Triton's do_bench for kernel timing. + + Uses Triton's adaptive benchmarking with fixed time budgets (warmup=25ms, rep=100ms) [Triton do_bench default]. + The number of trials is determined automatically based on kernel runtime. + + Note: num_warmup, num_trials, discard_first are ignored - included only for + API compatibility with other timing functions. 
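+
+    Example (illustrative sketch; `matmul_fn`, `a`, and `b` are placeholders, not
+    names defined in this patch):
+        timing_fn = get_timing_function("do_bench")
+        times = timing_fn(matmul_fn, [a, b])  # trial count is picked by Triton's time budget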
+ + Args: + kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_warmup: (ignored) Triton controls warmup + num_trials: (ignored) Triton controls trial count + discard_first: (ignored) Not used + verbose: Whether to print timing info + device: CUDA device to use + + Returns: + List of elapsed times in milliseconds + + See: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html """ do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments return triton_testing.do_bench(fn=do_bench_fn, @@ -174,7 +210,7 @@ def time_execution_with_do_bench_impl( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used yet + discard_first: int = 1, # not used here verbose: bool = True, device: torch.device | None = None) -> list[float]: """ @@ -184,14 +220,27 @@ def time_execution_with_do_bench_impl( Note we duplicate triton's implementation and modify / comment out parts to use num_warmup and num_trials that explicitly follows what user define here - instead of do_bench's version that computes how many times to run warmup and profile based on total warmup and repetition time + instead of do_bench's version that computes how many times to run warmup and + profile based on total warmup and repetition time + + We commented out unused parts and kept only what's needed for kernelbench timing eval + Args: + kernel_fn: Function to time + args: Arguments to pass to kernel_fn + num_warmup: Number of warmup iterations + num_trials: Number of timing trials + discard_first: (not used) Trials to discard + verbose: Whether to print timing info + device: CUDA device to use, defaults to current device + Returns: + List of elapsed times in milliseconds (length = num_trials) """ - device = torch.cuda.current_device() if device is not None else device + device = device if device is not None else torch.cuda.current_device() if verbose: print(f"Using do_bench to evaluate kernel on {device}") - # speicfy device interface (supports both nvidia and amd) + # specify device interface (supports both nvidia and amd) # under the hood, di is torch.cuda (amd uses a cuda compatible interface) di = triton_runtime.driver.active.get_device_interface() @@ -253,7 +302,7 @@ def time_execution_with_host_time( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, + discard_first: int = 1, # to reduce impact of initialization overhead verbose: bool = True, device: torch.device | None = None, ) -> list[float]: @@ -268,13 +317,12 @@ def time_execution_with_host_time( kernel_fn: Function to time args: Arguments to pass to kernel_fn num_trials: Number of timing trials to run + discard_first: Number of first few trials to discard (due to some initialization overhead) verbose: Whether to print per-trial timing info device: CUDA device to use, if None, use current device Returns: List of elapsed times in milliseconds - - Not recommended: """ if device is None: if verbose: @@ -329,6 +377,7 @@ def fetch_baseline_time( Fetch the baseline time from the time Note: might be better to just run the refernece using torch eager and compile sometimes + Will add this as a functionality for eval revamp """ if not os.path.exists(baseline_time_filepath): raise FileNotFoundError( diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 24d0faf2..b212bf78 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -37,12 +37,14 @@ def 
_run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): pytest.skip("CUDA not available, skipping timing tests") # Create simple test matrices - size = 512 - a = torch.randn(size, size, device=device) - b = torch.randn(size, size, device=device) + M = 2048 + N = 2048 + K = 2048 + a = torch.randn(M, K, device=device) + b = torch.randn(K, N, device=device) num_warmup = 5 - num_trials = 5 + num_trials = 100 # Define the kernel function to time def matmul_kernel(a, b): @@ -69,10 +71,14 @@ def matmul_kernel(a, b): print(stats) -timing_methods = ["cuda_event", "cpu_time", "do_bench_interface", "do_bench_impl"] +# test all currently available timing methods +def run_all_timing_tests(): + timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] -for timing_method in timing_methods: - _run_timing_smoke_test_matmul(timing_method) + for timing_method in timing_methods: + _run_timing_smoke_test_matmul(timing_method) + +run_all_timing_tests() From 4909b1dbdde3021ff3018425cefd27c638e75049 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Mon, 15 Dec 2025 04:11:33 +0000 Subject: [PATCH 10/13] add discard_first for cuda_event --- src/timing.py | 20 ++++++++++++-------- src/unit_tests/test_eval_timing.py | 5 ++++- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/timing.py b/src/timing.py index d07dd54f..2d422711 100644 --- a/src/timing.py +++ b/src/timing.py @@ -96,7 +96,7 @@ def time_execution_with_cuda_event( args: list[Any], num_warmup: int = 3, num_trials: int = 10, - discard_first: int = 1, # not used + discard_first: int = 1, # set to 0 to disable verbose: bool = True, device: torch.device = None, ) -> list[float]: @@ -110,7 +110,7 @@ def time_execution_with_cuda_event( args: Arguments to pass to kernel_fn num_warmup: Number of warmup iterations before timing num_trials: Number of timing trials to run - discard_first: not used + discard_first: Number of first trials to discard, for consistency with host_time, set to 0 to disable verbose: Whether to print per-trial timing info device: CUDA device to use, defaults to current device @@ -126,17 +126,17 @@ def time_execution_with_cuda_event( for _ in range(num_warmup): kernel_fn(*args) torch.cuda.synchronize(device=device) - + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache torch.cuda.empty_cache() - + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" ) elapsed_times: list[float] = [] # in ms # Timing trials - for trial in range(num_trials): + for trial in range(num_trials + discard_first): torch.cuda.synchronize(device=device) # block on all streams # create event marker default is not interprocess @@ -157,9 +157,13 @@ def time_execution_with_cuda_event( # Calculate the elapsed time in milliseconds elapsed_time_ms = start_event.elapsed_time(end_event) - if verbose: - print(f"Trial {trial + 1}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) + return elapsed_times diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index b212bf78..24efe8ac 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -61,9 +61,12 @@ def matmul_kernel(a, b): # Validate results assert isinstance(elapsed_times, list), "Expected list of 
elapsed times" + + # disabled this check as do_bench does not use num_trials # assert len(elapsed_times) == num_trials, f"Expected {num_trials} timing results, got {len(elapsed_times)}" assert all(isinstance(t, float) for t in elapsed_times), "All timing results should be floats" assert all(t > 0 for t in elapsed_times), "All timing results should be positive" + # DEBUG print times # print(f"smoke test matmul elapsed times with {timing_func_name} (in ms): {elapsed_times}") stats = timing.get_timing_stats(elapsed_times, device=device) @@ -74,7 +77,7 @@ def matmul_kernel(a, b): # test all currently available timing methods def run_all_timing_tests(): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] - + # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: _run_timing_smoke_test_matmul(timing_method) From 6c92786b9248370b4a165cb6d2e638997082a8ee Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Tue, 16 Dec 2025 01:15:22 +0000 Subject: [PATCH 11/13] add device context for profile on particular device --- .../generate_and_eval_single_sample_modal.py | 1 - src/timing.py | 200 ++++++++++-------- src/unit_tests/test_eval_timing.py | 10 +- 3 files changed, 116 insertions(+), 95 deletions(-) diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index 9dee518a..f41ba95f 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -14,7 +14,6 @@ from datasets import load_dataset #from src.dataset import construct_kernelbench_dataset -from src.eval import eval_kernel_against_ref from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets diff --git a/src/timing.py b/src/timing.py index 2d422711..8269feed 100644 --- a/src/timing.py +++ b/src/timing.py @@ -105,6 +105,10 @@ def time_execution_with_cuda_event( The first version of KernelBench used this for evaluation. We care about cold cache performance here. + Note: this version does not guard against adverserial cuda streams yet. + It assumes computation is done on the current stream for current device. + Stay tuned for future PRs. 
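+
+    Example (illustrative sketch; `kernel_fn`, `args`, and `device` are placeholders
+    for the callable under test, its inputs already on `device`, and the target GPU):
+        elapsed = time_execution_with_cuda_event(kernel_fn, args, num_warmup=3,
+                                                 num_trials=10, device=device)
+        stats = get_timing_stats(elapsed, device=device)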
+ Args: kernel_fn: Function to time args: Arguments to pass to kernel_fn @@ -122,47 +126,49 @@ def time_execution_with_cuda_event( print(f"Using current device: {torch.cuda.current_device()}") device = torch.cuda.current_device() - # Warm ups - for _ in range(num_warmup): - kernel_fn(*args) - torch.cuda.synchronize(device=device) - - # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache - torch.cuda.empty_cache() - - print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" - ) - - elapsed_times: list[float] = [] # in ms - - # Timing trials - for trial in range(num_trials + discard_first): - torch.cuda.synchronize(device=device) # block on all streams - - # create event marker default is not interprocess - start_event = torch.cuda.Event(enable_timing=True) - end_event = torch.cuda.Event(enable_timing=True) + with torch.cuda.device(device): - clear_l2_cache() # measuring cold cache performance + # Warm ups + for _ in range(num_warmup): + kernel_fn(*args) + torch.cuda.synchronize(device=device) - # note cuda events mark event on current stream - start_event.record() - _ = kernel_fn(*args) - end_event.record() - - # waits for all streams on that device - # though it is important to note the events only record time between on current stream - # TODO: find ways to check hacks by launching work on additional stream - torch.cuda.synchronize(device=device) - - # Calculate the elapsed time in milliseconds - elapsed_time_ms = start_event.elapsed_time(end_event) + # note this only release PyTorch’s CUDA caching allocator, not necessarily clearing device's L2 cache + torch.cuda.empty_cache() - if trial >= discard_first: - if verbose: - logical_idx = trial - discard_first + 1 - print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") - elapsed_times.append(elapsed_time_ms) + print(f"[Profiling] Using device: {device} {torch.cuda.get_device_name(device)}, warm up {num_warmup}, trials {num_trials}" + ) + + elapsed_times: list[float] = [] # in ms + + # Timing trials + for trial in range(num_trials + discard_first): + torch.cuda.synchronize(device=device) # block on all streams + + # create event marker default is not interprocess + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + clear_l2_cache(device=device) # measuring cold cache performance + + # note cuda events mark event on current stream + start_event.record() + _ = kernel_fn(*args) + end_event.record() + + # waits for all streams on that device + # though it is important to note the events only record time between on current stream + # TODO: find ways to check hacks by launching work on additional stream + torch.cuda.synchronize(device=device) + + # Calculate the elapsed time in milliseconds + elapsed_time_ms = start_event.elapsed_time(end_event) + + if trial >= discard_first: + if verbose: + logical_idx = trial - discard_first + 1 + print(f"Trial {logical_idx}: {elapsed_time_ms:.3g} ms") + elapsed_times.append(elapsed_time_ms) return elapsed_times @@ -200,8 +206,15 @@ def time_execution_with_do_bench_interface( See: https://triton-lang.org/main/python-api/generated/triton.testing.do_bench.html """ + if device is None: + if verbose: + print(f"Using current device: {torch.cuda.current_device()}") + device = torch.cuda.current_device() + + do_bench_fn = lambda : kernel_fn(*args) # wrap function with arguments - return triton_testing.do_bench(fn=do_bench_fn, + with 
torch.cuda.device(device): + return triton_testing.do_bench(fn=do_bench_fn, warmup=25, rep=100, grad_to_none=None, @@ -244,58 +257,63 @@ def time_execution_with_do_bench_impl( if verbose: print(f"Using do_bench to evaluate kernel on {device}") - # specify device interface (supports both nvidia and amd) - # under the hood, di is torch.cuda (amd uses a cuda compatible interface) - di = triton_runtime.driver.active.get_device_interface() - - kernel_fn(*args) - di.synchronize(device=device) - - # clear l2 cache - cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() - - # do_bench Estimate the runtime of the function - # Here we are not using it not needed since now the warmup and repeat steps are set by the user) - # start_event = di.Event(enable_timing=True) - # end_event = di.Event(enable_timing=True) - # start_event.record() - # for _ in range(5): - # triton_runtime.driver.active.clear_cache(cache) - # kernel_fn(*args) - # end_event.record() - # di.synchronize() - # estimate_ms = start_event.elapsed_time(end_event) / 5 - - # compute number of warmup and repeat - # Change - # n_warmup = max(1, int(warmup / estimate_ms)) - # n_repeat = max(1, int(rep / estimate_ms)) - # n_warmup = warmup - # n_repeat = rep - # end of change - start_event = [di.Event(enable_timing=True) for i in range(num_trials)] - end_event = [di.Event(enable_timing=True) for i in range(num_trials)] - # Warm-up - for _ in range(num_warmup): - kernel_fn(*args) - # Benchmark - for i in range(num_trials): - # All KernelBench functions are forward passes, so we don't need to reset gradients - # we don't want `fn` to accumulate gradient values - # if it contains a backward pass. So we clear the - # provided gradients - # if grad_to_none is not None: - # for x in grad_to_none: - # x.grad = None - - # we clear the L2 cache before each run - triton_runtime.driver.active.clear_cache(cache) - # record time of `fn` - start_event[i].record() + + # added to constraint to this device + with torch.cuda.device(device): + + # specify device interface (supports both nvidia and amd) + # under the hood, di is torch.cuda (amd uses a cuda compatible interface) + di = triton_runtime.driver.active.get_device_interface() + kernel_fn(*args) - end_event[i].record() - # Record clocks - di.synchronize(device=device) + di.synchronize(device=device) + + # clear l2 cache + cache = triton_runtime.driver.active.get_empty_cache_for_benchmark() + + # do_bench Estimate the runtime of the function + # Here we are not using it not needed since now the warmup and repeat steps are set by the user) + # start_event = di.Event(enable_timing=True) + # end_event = di.Event(enable_timing=True) + # start_event.record() + # for _ in range(5): + # triton_runtime.driver.active.clear_cache(cache) + # kernel_fn(*args) + # end_event.record() + # di.synchronize() + # estimate_ms = start_event.elapsed_time(end_event) / 5 + + # compute number of warmup and repeat + # Change + # n_warmup = max(1, int(warmup / estimate_ms)) + # n_repeat = max(1, int(rep / estimate_ms)) + # n_warmup = warmup + # n_repeat = rep + # end of change + start_event = [di.Event(enable_timing=True) for i in range(num_trials)] + end_event = [di.Event(enable_timing=True) for i in range(num_trials)] + # Warm-up + for _ in range(num_warmup): + kernel_fn(*args) + # Benchmark + for i in range(num_trials): + # All KernelBench functions are forward passes, so we don't need to reset gradients + # we don't want `fn` to accumulate gradient values + # if it contains a backward pass. 
So we clear the + # provided gradients + # if grad_to_none is not None: + # for x in grad_to_none: + # x.grad = None + + # we clear the L2 cache before each run + triton_runtime.driver.active.clear_cache(cache) + # record time of `fn` + start_event[i].record() + kernel_fn(*args) + end_event[i].record() + # Record clocks + di.synchronize(device=device) + if verbose: print('Done with do_bench evaluation') times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] return times @@ -350,7 +368,7 @@ def time_execution_with_host_time( torch.cuda.synchronize(device=device) # focus on cold_cache performance - clear_l2_cache() + clear_l2_cache(device=device) # CPU-side wall clock time using perf_counter (high-resolution timer) start_time = time.perf_counter() diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 24efe8ac..07fca713 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -57,6 +57,7 @@ def matmul_kernel(a, b): num_warmup=num_warmup, num_trials=num_trials, verbose=False, + device=device ) # Validate results @@ -75,13 +76,16 @@ def matmul_kernel(a, b): # test all currently available timing methods -def run_all_timing_tests(): +def run_all_timing_tests(device="cuda"): timing_methods = ["cuda_event", "host_time", "do_bench", "do_bench_impl"] # timing_methods = ["cuda_event", "do_bench_impl"] for timing_method in timing_methods: - _run_timing_smoke_test_matmul(timing_method) + _run_timing_smoke_test_matmul(timing_method, device=device) -run_all_timing_tests() + + +test_device = torch.device("cuda:5") +run_all_timing_tests(test_device) From 8a165d67271cd89bd29b8a2add808155d9d306f1 Mon Sep 17 00:00:00 2001 From: Pietro Date: Tue, 16 Dec 2025 01:22:22 +0000 Subject: [PATCH 12/13] nit fix ready for merge --- src/unit_tests/test_eval_timing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unit_tests/test_eval_timing.py b/src/unit_tests/test_eval_timing.py index 07fca713..84921a37 100644 --- a/src/unit_tests/test_eval_timing.py +++ b/src/unit_tests/test_eval_timing.py @@ -26,7 +26,7 @@ def _run_timing_smoke_test_matmul(timing_func_name:str, device:str="cuda"): """ Scaffold function for timing smoke tests. - Smoke test for using 512x512 matmul. + Smoke test for using 2048x2048x2048 matmul with 5 warmup and 100 trials. 
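+
+    Example (sketch):
+        _run_timing_smoke_test_matmul("cuda_event", device="cuda")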
Args: timing_fn: The timing function to test From c063b8127fd6b1b1825b38344bfe7ba4a378604e Mon Sep 17 00:00:00 2001 From: Pietro Date: Tue, 16 Dec 2025 01:33:32 +0000 Subject: [PATCH 13/13] type annotation for device --- src/timing.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/timing.py b/src/timing.py index 8269feed..8a36522b 100644 --- a/src/timing.py +++ b/src/timing.py @@ -18,7 +18,7 @@ # Try them out at src/unit_tests/test_eval_timing.py ################################################################################ -def clear_l2_cache(device: str = "cuda"): +def clear_l2_cache(device: torch.device | str = "cuda"): """ Clear L2 Cache line by thrashing with a large tensor Acknowledge GPU mode reference kernel repo: @@ -150,7 +150,7 @@ def time_execution_with_cuda_event( end_event = torch.cuda.Event(enable_timing=True) clear_l2_cache(device=device) # measuring cold cache performance - + # note cuda events mark event on current stream start_event.record() _ = kernel_fn(*args) @@ -295,6 +295,8 @@ def time_execution_with_do_bench_impl( # Warm-up for _ in range(num_warmup): kernel_fn(*args) + di.synchronize(device=device) + # Benchmark for i in range(num_trials): # All KernelBench functions are forward passes, so we don't need to reset gradients @@ -313,9 +315,9 @@ def time_execution_with_do_bench_impl( end_event[i].record() # Record clocks di.synchronize(device=device) - + times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] + if verbose: print('Done with do_bench evaluation') - times = [s.elapsed_time(e) for s, e in zip(start_event, end_event)] return times