Binary file modified .gitignore
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements.txt
@@ -10,13 +10,15 @@ modal
# DSLs
nvidia-cutlass-dsl
tilelang
triton

# helper
tqdm
packaging
pydra_config
pytest
ninja
cupy-cuda12x

# Numerics
einops
6 changes: 6 additions & 0 deletions scripts/eval_from_generations.py
@@ -113,6 +113,7 @@ def __init__(self):
self.num_perf_trials = 100
self.timeout = 180 # in seconds
self.measure_performance = True
self.timing_method = "cuda_event"

# Eval Flow setting
# To speedup evaluation, you can start building the kernel on CPU on disk as cache
@@ -173,6 +174,7 @@ def evaluate_single_sample_modal(
num_correct_trials: int = 5,
num_perf_trials: int = 100,
measure_performance: bool = True,
timing_method: str = "cuda_event",
verbose: bool = False,
backend: str = "cuda",
precision: str = "fp32",
@@ -212,6 +214,7 @@ def evaluate_single_sample_modal(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
@@ -324,6 +327,7 @@ def evaluate_single_sample(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
@@ -384,6 +388,7 @@ def evaluate_single_sample_modal_direct(
num_correct_trials=configs.num_correct_trials,
num_perf_trials=configs.num_perf_trials,
measure_performance=configs.measure_performance,
timing_method=configs.timing_method,
verbose=configs.verbose,
)
return eval_result
@@ -502,6 +507,7 @@ def batch_eval_modal(
num_correct_trials=config.num_correct_trials,
num_perf_trials=config.num_perf_trials,
measure_performance=config.measure_performance,
timing_method=config.timing_method,
verbose=config.verbose,
backend=config.backend,
precision=config.precision,
2 changes: 2 additions & 0 deletions scripts/generate_and_eval_single_sample.py
@@ -73,6 +73,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py

# Prompt construction
self.prompt_option = "one_shot" # choices: zero_shot, one_shot, few_shot
@@ -267,6 +268,7 @@ def main(config: EvalConfig):
custom_kernel,
verbose=config.verbose,
measure_performance=True,
timing_method=config.timing_method,
num_correct_trials=5,
num_perf_trials=100,
backend=config.backend,
7 changes: 4 additions & 3 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -14,7 +14,6 @@
from datasets import load_dataset

#from src.dataset import construct_kernelbench_dataset
from src.eval import eval_kernel_against_ref
from src.prompt_constructor_toml import get_prompt_for_backend, get_custom_prompt
from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets

@@ -75,6 +74,7 @@ def __init__(self):
self.log_eval_result = False

self.backend = "cuda"
self.timing_method = "cuda_event" # see timing.py
# Prompt generation settings
self.prompt_option = "one_shot" # zero_shot, one_shot, few_shot
self.include_hardware_info = False
@@ -110,7 +110,7 @@ def __repr__(self):
class EvalFunc:

@modal.method()
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision):
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision, timing_method):
# 3. Evaluate Kernel
# NOTE: no need to wrap around process here as only a single sample
# see batch eval for examples of process isolation
@@ -121,6 +121,7 @@ def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arc
modal_set_gpu_arch(gpu_arch)
return eval_kernel_against_ref(
ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True,
timing_method=timing_method,
num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision)
)

@@ -274,7 +275,7 @@ def main(config: EvalConfig):

with app.run():
kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote(
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision, config.timing_method
)

print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}")
14 changes: 10 additions & 4 deletions scripts/generate_baseline_time.py
@@ -2,11 +2,13 @@
import numpy as np
from src.eval import (
load_original_model_and_inputs,
time_execution_with_cuda_event,
get_timing_stats,
set_seed,
fetch_ref_arch_from_problem_id,
)
from src.timing import (
get_timing_function,
get_timing_stats,
)
from src.dataset import construct_problem_dataset_from_problem_dir
from src.utils import read_file
import os
@@ -81,6 +83,7 @@ def measure_program_time(
torch_compile_options: str="default",
device: torch.device="cuda:0",
verbose: bool = False,
timing_method: str = "cuda_event",
) -> dict:
"""
Measure the time of a KernelBench reference architecture
@@ -116,8 +119,11 @@

model = model.cuda(device=device)
torch.cuda.synchronize(device=device)
elapsed_times = time_execution_with_cuda_event(
model, *inputs, num_trials=num_trials, verbose=verbose, device=device

# run chosen timing function
timing_fn = get_timing_function(timing_method)
elapsed_times = timing_fn(
model, inputs, num_trials=num_trials, verbose=verbose, device=device
)
runtime_stats = get_timing_stats(elapsed_times, device=device)

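Note: src/timing.py itself is not among the hunks shown above, so the shape of get_timing_function and get_timing_stats is inferred from their call sites. Below is a minimal sketch, assuming the timing functions take (model, inputs, num_trials, verbose, device) as used in measure_program_time and that "cuda_event" maps to a CUDA-event-based timer; the helper name time_with_cuda_events and the exact stats fields are hypothetical.

import numpy as np
import torch

def time_with_cuda_events(model, inputs, num_trials=100, verbose=False, device=None):
    # Hypothetical CUDA-event timer; assumes `inputs` is a tuple of tensors
    # already on `device` and that the model has been moved to the GPU.
    elapsed_times = []
    for trial in range(num_trials):
        torch.cuda.synchronize(device=device)  # drain prior work before timing
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        model(*inputs)
        end.record()
        torch.cuda.synchronize(device=device)  # wait for the timed kernel(s)
        elapsed_times.append(start.elapsed_time(end))  # milliseconds
        if verbose:
            print(f"trial {trial}: {elapsed_times[-1]:.3f} ms")
    return elapsed_times

# Registry keyed by the timing_method strings used in the configs above.
_TIMING_FUNCTIONS = {"cuda_event": time_with_cuda_events}

def get_timing_function(timing_method: str):
    # Dispatch on the config value; "cuda_event" is the default throughout this PR.
    return _TIMING_FUNCTIONS[timing_method]

def get_timing_stats(elapsed_times, device=None):
    # Summary statistics over the per-trial times (the exact fields are an assumption).
    times = np.asarray(elapsed_times)
    return {
        "mean": float(times.mean()),
        "std": float(times.std()),
        "min": float(times.min()),
        "max": float(times.max()),
        "num_trials": len(times),
    }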
20 changes: 16 additions & 4 deletions scripts/run_and_check.py
@@ -57,6 +57,8 @@
Usage:
1. PyTorch reference is a local file (local eval)
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local
python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/few_shot/model_ex_tiled_matmul.py kernel_src_path=src/prompts/few_shot/model_new_ex_tiled_matmul.py eval_mode=local


2. PyTorch reference is a kernelbench problem (local eval)
python3 scripts/run_and_check.py ref_origin=kernelbench level=<level> problem_id=<problem_id> kernel_src_path=<path to model-generated kernel> eval_mode=local
@@ -101,6 +103,7 @@ def __init__(self):
# verbose logging
self.verbose = False
self.measure_performance = True
self.timing_method = "cuda_event" # see timing.py
self.build_dir_prefix = "" # if you want to specify a custom build directory
self.clear_cache = False # TODO

@@ -128,18 +131,23 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

try:
eval_result = kernel_eval.eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
build_dir=build_dir,
device=device,
backend=configs["backend"],
precision=kernel_eval.get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result
except Exception as e:
@@ -180,17 +188,21 @@ def evaluate_single_sample_src_modal(self, ref_arch_src: str, kernel_src: str, c
num_perf_trials = configs["num_perf_trials"]
verbose = configs["verbose"]
measure_performance = configs["measure_performance"]
timing_method = configs["timing_method"]
backend = configs["backend"]
precision = kernel_eval.get_torch_dtype_from_string(configs["precision"])

eval_result = eval_kernel_against_ref(
original_model_src=ref_arch_src,
custom_model_src=kernel_src,
measure_performance=measure_performance,
timing_method=timing_method,
verbose=verbose,
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
device=device,
backend=configs["backend"],
precision=get_torch_dtype_from_string(configs["precision"])
backend=backend,
precision=precision
)
return eval_result

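Usage note: following the key=value overrides shown in the run_and_check.py docstring above, the new timing method should be selectable per run. An illustrative invocation; "cuda_event" is simply the default shown in the configs, and any other accepted values are not listed in this diff:

python3 scripts/run_and_check.py ref_origin=local ref_arch_src_path=src/prompts/model_ex_add.py kernel_src_path=src/prompts/model_new_ex_add.py eval_mode=local timing_method=cuda_event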