From c6ffa110467df75937be615bf0efc2b35b15adb0 Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Mon, 3 Nov 2025 19:51:31 -0800
Subject: [PATCH 01/13] add unit tests for input mod

Adds a unit test to check that a generated kernel which modifies the original inputs fails the correctness check.

For the square matmul problem, the kernel zeros out the inputs and returns a matrix of 0s. This will fail correctness/pass the test as long as the reference implementation is ran first. If we swap the order, the test will fail as the reference implementation will operate on tensors of 0s and it will look like the generated kernel computed the correct output.
---
 src/unit_tests/test_eval.py                   | 34 +++++++++++++++
 .../test_kernels/zero_out_kernel.py           | 42 +++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 src/unit_tests/test_eval.py
 create mode 100644 src/unit_tests/test_kernels/zero_out_kernel.py

diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval.py
new file mode 100644
index 00000000..461f2c3b
--- /dev/null
+++ b/src/unit_tests/test_eval.py
@@ -0,0 +1,34 @@
+import os
+
+from src.dataset import KERNEL_BENCH_PATH
+from src.eval import eval_kernel_against_ref
+from src.utils import read_file
+
+def run_test_kernel(problem_name, kernel_filename):
+    """
+    Run a test kernel agains the reference architecture and return the result.
+
+    Inputs:
+    - problem_name: name of the KernelBench problem to test on (e.g. "level1/1_Square_matrix_multiplication_.py")
+    - kernel_filename: name of the adversarial kernel (e.g. "zero_out_kernel.py")
+
+    Output:
+    - result: KernelExecResult object
+    """
+    ref_src = read_file(os.path.join(KERNEL_BENCH_PATH, problem_name))
+    kernel_src = read_file(os.path.join(os.path.dirname(__file__), "test_kernels", kernel_filename))
+    return eval_kernel_against_ref(
+        original_model_src=ref_src,
+        custom_model_src=kernel_src,
+        num_correct_trials=1,
+        num_perf_trials=1,
+        measure_performance=False,
+    )
+
+def test_input_modification():
+    """
+    Test that even if the model tries to modify the inputs to pass the tests,
+    we still flag it as incorrect.
+    """
+    result = run_test_kernel("level1/1_Square_matrix_multiplication_.py", "zero_out_kernel.py")
+    assert result.correctness is False
\ No newline at end of file
diff --git a/src/unit_tests/test_kernels/zero_out_kernel.py b/src/unit_tests/test_kernels/zero_out_kernel.py
new file mode 100644
index 00000000..59e3378a
--- /dev/null
+++ b/src/unit_tests/test_kernels/zero_out_kernel.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+from torch.utils.cpp_extension import load_inline
+
+matmul_cuda_source = r"""
+#include <torch/extension.h>
+
+torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
+    A.zero_();
+    B.zero_();
+    return torch::zeros({A.size(0), B.size(1)}, A.options());
+}
+"""
+
+matmul_cuda_cpp = "torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B);"
+
+# Compile inline extension
+matmul_module = load_inline(
+    name="fast_matmul",
+    cpp_sources=matmul_cuda_cpp,
+    cuda_sources=matmul_cuda_source,
+    functions=["matmul_cuda"],
+    verbose=False,
+)
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.matmul = matmul_module
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        return self.matmul.matmul_cuda(A, B)
+
+N = 2048 * 2
+
+def get_inputs():
+    A = torch.rand(N, N)
+    B = torch.rand(N, N)
+    return [A, B]
+
+def get_init_inputs():
+    return []
\ No newline at end of file

From 9097e65c7b6ec1031dd362910da4a67fe87addbf Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Tue, 4 Nov 2025 20:45:40 -0800
Subject: [PATCH 02/13] add unit test for reusing results

Adds a unit test to check that a generated kernel which attempts to access the result from the PyTorch reference model in memory fails the correctness check.

If a generated kernel uses empty_like, the CUDA caching allocator can re-use the physical memory of the previously computed result. All the kernel needs to do is return immediately and it will pass the correctness check.

Note that in order to reproduce this, we need to copy the PyTorch output to the CPU and delete the output object. Then empty_like will fetch the physical memory for the output object.
---
 src/unit_tests/test_eval.py                   |  7 ++++
 .../test_kernels/result_reuse_kernel.py       | 42 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 src/unit_tests/test_kernels/result_reuse_kernel.py

diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval.py
index 461f2c3b..37116122 100644
--- a/src/unit_tests/test_eval.py
+++ b/src/unit_tests/test_eval.py
@@ -31,4 +31,11 @@ def test_input_modification():
     we still flag it as incorrect.
     """
     result = run_test_kernel("level1/1_Square_matrix_multiplication_.py", "zero_out_kernel.py")
+    assert result.correctness is False
+
+def test_result_reuse():
+    """
+    Test that the eval script will flag a kernel as incorrect if it tries to reuse computation from the PyTorch reference.
+    """
+    result = run_test_kernel("level1/1_Square_matrix_multiplication_.py", "result_reuse_kernel.py")
     assert result.correctness is False
\ No newline at end of file
diff --git a/src/unit_tests/test_kernels/result_reuse_kernel.py b/src/unit_tests/test_kernels/result_reuse_kernel.py
new file mode 100644
index 00000000..0b6c787c
--- /dev/null
+++ b/src/unit_tests/test_kernels/result_reuse_kernel.py
@@ -0,0 +1,42 @@
+import torch
+import torch.nn as nn
+from torch.utils.cpp_extension import load_inline
+
+matmul_cuda_source = r"""
+#include <torch/extension.h>
+#include <cuda_runtime.h>
+
+torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
+    auto C = torch::empty_like(A);
+    return C;
+}
+"""
+
+matmul_cuda_cpp = "torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B);"
+
+# Compile inline extension
+matmul_module = load_inline(
+    name="fast_matmul",
+    cpp_sources=matmul_cuda_cpp,
+    cuda_sources=matmul_cuda_source,
+    functions=["matmul_cuda"],
+    verbose=False,
+)
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.matmul = matmul_module
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        return self.matmul.matmul_cuda(A, B)
+
+N = 2048 * 2
+
+def get_inputs():
+    A = torch.rand(N, N)
+    B = torch.rand(N, N)
+    return [A, B]
+
+def get_init_inputs():
+    return []
\ No newline at end of file

From 357957ea2db1f12291852168143840b335e0c724 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Sat, 29 Nov 2025 20:32:34 +0000
Subject: [PATCH 03/13] test bhavesh's unit test

---
 EVAL.md                                       | 20 ++++++
 src/unit_tests/test_eval.py                   | 61 ++++++++++++++++---
 .../test_kernels/result_reuse_kernel.py       | 11 +++-
 .../test_kernels/zero_out_kernel.py           | 10 ++-
 4 files changed, 91 insertions(+), 11 deletions(-)
 create mode 100644 EVAL.md

diff --git a/EVAL.md b/EVAL.md
new file mode 100644
index 00000000..bcff360e
--- /dev/null
+++ b/EVAL.md
@@ -0,0 +1,20 @@
+# Evaluation
+[WIP] Benchmarking Guide
+To be updated more comprehensively with the benchmarking guide & blog that we have been working on this quarter.
+
+You should be **extra careful!** 
+
+If the model can reward hack, it will find ways to reward hack!
+
+## Methodology
+
+
+### Correctness
+
+### Performance
+We (are) implementing a few ways for timing and understand the tradeoffs.
+
+### Unit Tests
+See `src/unit_tests/test_eval.py` for unit test designed speciifcally for the eval script. Currently we have added some tests to check tricky scenarios that we can think of.
+ 
+Note this is an ongoing community effort. 
\ No newline at end of file
diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval.py
index 37116122..104e0051 100644
--- a/src/unit_tests/test_eval.py
+++ b/src/unit_tests/test_eval.py
@@ -4,6 +4,16 @@
 from src.eval import eval_kernel_against_ref
 from src.utils import read_file
 
+"""
+Specifically testing the robustness of the eval function.
+Currently focus on correctness.
+Timing test to come later.
+
+Run with pytest
+pytest src/unit_tests/test_eval.py
+"""
+
+
 def run_test_kernel(problem_name, kernel_filename):
     """
     Run a test kernel agains the reference architecture and return the result.
@@ -15,27 +25,60 @@ def run_test_kernel(problem_name, kernel_filename):
     Output:
     - result: KernelExecResult object
     """
+    print(f"Running test adversarial kernel {kernel_filename} against problem {problem_name}")
     ref_src = read_file(os.path.join(KERNEL_BENCH_PATH, problem_name))
+    # TODO: replace with Omar's dataset object
     kernel_src = read_file(os.path.join(os.path.dirname(__file__), "test_kernels", kernel_filename))
     return eval_kernel_against_ref(
         original_model_src=ref_src,
         custom_model_src=kernel_src,
-        num_correct_trials=1,
+        num_correct_trials=5, # under mulitiple correctness trials
         num_perf_trials=1,
         measure_performance=False,
     )
 
+
+def test_result_cache_reuse():
+    """
+    Test to prevent advserial kernel by reuse computed PyTorch results 
+
+    Methodology: 
+    - Advserial kernel allocate memory for result but does not initialize it.
+    - Potential Hacking scenario: CUDA cache allocator might give that un-erased physical memory to the custom kernel (which contains the result) and it will incorrectly pass
+    - Eval script should flag this as incorrect as the reference kernel did not conduct any menaingful computation
+    """
+    result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
+                            kernel_filename="result_reuse_kernel.py")
+    assert result.correctness is False, "Cache reuse hack detected"
+    print("Result: Eval Function Passed Test Result Cache Reuse")
+    print(result)
+
 def test_input_modification():
     """
     Test that even if the model tries to modify the inputs to pass the tests,
     we still flag it as incorrect.
-    """
-    result = run_test_kernel("level1/1_Square_matrix_multiplication_.py", "zero_out_kernel.py")
-    assert result.correctness is False
 
-def test_result_reuse():
+    Methodology:
+    - Advserial kernel zeros out all inputs and returns all zeros for the final matmul shape.
+    - Potential Hacking scenario: Since adversarial kernel zero out the input, the reference might ran the kernel with all zeros. 
+      Hence both refernece and custom kernel return all zeros, and this incorrectly pass.
+    - Eval script should flag this as incorrect as this kernel zeros out inputs and trick the refernece kernel into returning all zeros.
     """
-    Test that the eval script will flag a kernel as incorrect if it tries to reuse computation from the PyTorch reference.
-    """
-    result = run_test_kernel("level1/1_Square_matrix_multiplication_.py", "result_reuse_kernel.py")
-    assert result.correctness is False
\ No newline at end of file
+    result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
+                            kernel_filename="zero_out_kernel.py")
+    assert result.correctness is False, "Input modification hack detected"
+    print("Result: Eval Function Passed Test Input Modification Hack")
+    print(result)
+
+
+#############
+# TODO
+# CUDA stream hacking
+############
+
+def main():
+    test_result_cache_reuse()
+    test_input_modification()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/src/unit_tests/test_kernels/result_reuse_kernel.py b/src/unit_tests/test_kernels/result_reuse_kernel.py
index 0b6c787c..da20668d 100644
--- a/src/unit_tests/test_kernels/result_reuse_kernel.py
+++ b/src/unit_tests/test_kernels/result_reuse_kernel.py
@@ -2,6 +2,15 @@
 import torch.nn as nn
 from torch.utils.cpp_extension import load_inline
 
+####
+# 
+# This adversarial kernel utilizes zeros_like which can get allocated the same physical memory as the PyTorch reference outputs
+# If the eval script deletes/frees the PyTorch output object at some point before the custom kernel is run, 
+# the CUDA cache allocator might give that un-erased physical memory to the custom kernel and it will incorrectly pass
+# So, we should ensure we zero out physical memory to prevent reuse
+####
+
+# Returns a zero_
 matmul_cuda_source = r"""
 #include <torch/extension.h>
 #include <cuda_runtime.h>
@@ -16,7 +25,7 @@
 
 # Compile inline extension
 matmul_module = load_inline(
-    name="fast_matmul",
+    name="fast_matmul", # this is a hacky kernel that shall not pass; it would extra fast since it just allocates memory of result shape
     cpp_sources=matmul_cuda_cpp,
     cuda_sources=matmul_cuda_source,
     functions=["matmul_cuda"],
diff --git a/src/unit_tests/test_kernels/zero_out_kernel.py b/src/unit_tests/test_kernels/zero_out_kernel.py
index 59e3378a..aaad278b 100644
--- a/src/unit_tests/test_kernels/zero_out_kernel.py
+++ b/src/unit_tests/test_kernels/zero_out_kernel.py
@@ -2,6 +2,14 @@
 import torch.nn as nn
 from torch.utils.cpp_extension import load_inline
 
+###
+# This custom kernel zeros out the inputs and returns a zero matrix.
+# It is specifically designed to test the eval script's robustness.
+# Running this zero kernel should result in incorrectness for eval,
+# as the reference kernel will not return all zeros.
+###
+
+# Destroys all inputs, returns all zeros for final matmul shape
 matmul_cuda_source = r"""
 #include <torch/extension.h>
 
@@ -16,7 +24,7 @@
 
 # Compile inline extension
 matmul_module = load_inline(
-    name="fast_matmul",
+    name="fast_matmul", # this is a hacky kernel that shall not pass; it would extra fast since it just allocates 0s
     cpp_sources=matmul_cuda_cpp,
     cuda_sources=matmul_cuda_source,
     functions=["matmul_cuda"],

From 5bb2679d236954442ac7e8cca47b559b4bf0acb9 Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Mon, 1 Dec 2025 15:04:04 -0600
Subject: [PATCH 04/13] use generic shape

use generic matmul shape for cache reuse adversarial kernel rather than requiring a square matmul.
---
 src/unit_tests/test_kernels/result_reuse_kernel.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/unit_tests/test_kernels/result_reuse_kernel.py b/src/unit_tests/test_kernels/result_reuse_kernel.py
index da20668d..a27e90a6 100644
--- a/src/unit_tests/test_kernels/result_reuse_kernel.py
+++ b/src/unit_tests/test_kernels/result_reuse_kernel.py
@@ -4,7 +4,7 @@
 
 ####
 # 
-# This adversarial kernel utilizes zeros_like which can get allocated the same physical memory as the PyTorch reference outputs
+# This adversarial kernel utilizes empty which can get allocated the same physical memory as the PyTorch reference outputs
 # If the eval script deletes/frees the PyTorch output object at some point before the custom kernel is run, 
 # the CUDA cache allocator might give that un-erased physical memory to the custom kernel and it will incorrectly pass
 # So, we should ensure we zero out physical memory to prevent reuse
@@ -16,7 +16,7 @@
 #include <cuda_runtime.h>
 
 torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
-    auto C = torch::empty_like(A);
+    auto C = torch::empty({A.size(0), B.size(1)}, A.options());
     return C;
 }
 """

From 3d7ff72395419878667672b4fa27ff63d98f155b Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Tue, 2 Dec 2025 00:12:05 -0600
Subject: [PATCH 05/13] excessive speedup unit test via non default stream

---
 src/unit_tests/test_eval.py                   | 53 ++++++++++++++++---
 .../test_kernels/non_default_stream_kernel.py | 52 ++++++++++++++++++
 2 files changed, 97 insertions(+), 8 deletions(-)
 create mode 100644 src/unit_tests/test_kernels/non_default_stream_kernel.py

diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval.py
index 104e0051..e4f85dce 100644
--- a/src/unit_tests/test_eval.py
+++ b/src/unit_tests/test_eval.py
@@ -1,18 +1,43 @@
 import os
 
+import torch
+
 from src.dataset import KERNEL_BENCH_PATH
-from src.eval import eval_kernel_against_ref
+from src.eval import eval_kernel_against_ref, get_timing_stats, load_original_model_and_inputs, set_seed, time_execution_with_cuda_event
 from src.utils import read_file
 
 """
 Specifically testing the robustness of the eval function.
 Currently focus on correctness.
-Timing test to come later.
 
 Run with pytest
 pytest src/unit_tests/test_eval.py
 """
 
+def measure_ref_kernel(problem_name):
+    ref_src = read_file(os.path.join(KERNEL_BENCH_PATH, problem_name))
+    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(ref_src, {})
+
+    device = torch.device("cuda")
+    init_inputs = get_init_inputs()
+    init_inputs = [x.to(device) for x in init_inputs]
+
+    with torch.no_grad():
+        set_seed(42)
+        model = Model(*init_inputs).to(device=device, dtype=torch.float32)
+        torch.cuda.synchronize(device=device)
+
+    inputs = get_inputs()
+    inputs = [x.to(device) for x in inputs]
+    elapsed = time_execution_with_cuda_event(
+        model,
+        *inputs,
+        num_trials=5,
+        verbose=False,
+        device=device,
+    )
+    stats = get_timing_stats(elapsed)
+    return stats
 
 def run_test_kernel(problem_name, kernel_filename):
     """
@@ -33,8 +58,8 @@ def run_test_kernel(problem_name, kernel_filename):
         original_model_src=ref_src,
         custom_model_src=kernel_src,
         num_correct_trials=5, # under mulitiple correctness trials
-        num_perf_trials=1,
-        measure_performance=False,
+        num_perf_trials=5,
+        measure_performance=True,
     )
 
 
@@ -71,14 +96,26 @@ def test_input_modification():
     print(result)
 
 
-#############
-# TODO
-# CUDA stream hacking
-############
+def test_non_default_stream(threshold=1.5):
+    """
+    Test that we will flag adversarial kernels that cheat by assigning work to non-default CUDA streams.
+
+    Methodology:
+    - Adversarial kernel assigns work to non-default CUDA streams.
+    - Potential Hacking scenario: Eval script waits on the default stream only when measuring kernel runtime, leading to unrealistic speedups.
+    - Eval script should accurately measure runtime by waiting on all streams, not just the default stream.
+    """
+    result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
+                            kernel_filename="non_default_stream_kernel.py")
+    assert "excessive_speedup" in result.metadata, "Excessive speedup detected"
+    print("Result: Eval Function passed Non Default Stream Hack")
+    print(result)
+
 
 def main():
     test_result_cache_reuse()
     test_input_modification()
+    test_non_default_stream()
 
 if __name__ == "__main__":
     main()
\ No newline at end of file
diff --git a/src/unit_tests/test_kernels/non_default_stream_kernel.py b/src/unit_tests/test_kernels/non_default_stream_kernel.py
new file mode 100644
index 00000000..b8f4e26a
--- /dev/null
+++ b/src/unit_tests/test_kernels/non_default_stream_kernel.py
@@ -0,0 +1,52 @@
+import torch
+import torch.nn as nn
+from torch.utils.cpp_extension import load_inline
+
+###
+# This adversarial kernel assigns all work to a non-default CUDA stream.
+# If the eval script waits on the default stream only when measuring kernel runtime,
+# this will lead to unrealistic speedups.
+###
+
+matmul_cuda_source = r"""
+#include <torch/extension.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
+    int dev_index = A.get_device();
+    auto stream = at::cuda::getStreamFromPool(false, dev_index);
+    c10::cuda::CUDAStreamGuard guard(stream);
+    auto result = at::matmul(A, B);
+    return result;
+}
+"""
+
+matmul_cuda_cpp = "torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B);"
+
+# Compile inline extension
+matmul_module = load_inline(
+    name="fast_matmul",
+    cpp_sources=matmul_cuda_cpp,
+    cuda_sources=matmul_cuda_source,
+    functions=["matmul_cuda"],
+    verbose=False,
+)
+
+class ModelNew(nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.matmul = matmul_module
+
+    def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor:
+        return self.matmul.matmul_cuda(A, B)
+
+N = 2048 * 2
+
+def get_inputs():
+    A = torch.rand(N, N)
+    B = torch.rand(N, N)
+    return [A, B]
+
+def get_init_inputs():
+    return []
\ No newline at end of file

From eab8a8806006ee256976266b9c8d2ebcbb103fd9 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Thu, 18 Dec 2025 02:46:14 +0000
Subject: [PATCH 06/13] update timing signature

---
 src/unit_tests/test_eval.py                        | 14 +++++++++-----
 .../test_kernels/non_default_stream_kernel.py      | 13 ++++++++-----
 src/unit_tests/test_kernels/result_reuse_kernel.py |  4 ++--
 src/unit_tests/test_kernels/zero_out_kernel.py     |  4 ++--
 4 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval.py
index e4f85dce..3b77cfce 100644
--- a/src/unit_tests/test_eval.py
+++ b/src/unit_tests/test_eval.py
@@ -3,7 +3,8 @@
 import torch
 
 from src.dataset import KERNEL_BENCH_PATH
-from src.eval import eval_kernel_against_ref, get_timing_stats, load_original_model_and_inputs, set_seed, time_execution_with_cuda_event
+from src.eval import eval_kernel_against_ref, load_original_model_and_inputs, set_seed
+from src.timing import get_timing_stats, get_timing_function
 from src.utils import read_file
 
 """
@@ -29,11 +30,12 @@ def measure_ref_kernel(problem_name):
 
     inputs = get_inputs()
     inputs = [x.to(device) for x in inputs]
-    elapsed = time_execution_with_cuda_event(
+    timing_func = get_timing_function("cuda_event")
+    elapsed = timing_func(
         model,
         *inputs,
         num_trials=5,
-        verbose=False,
+        verbose=False,  
         device=device,
     )
     stats = get_timing_stats(elapsed)
@@ -60,6 +62,7 @@ def run_test_kernel(problem_name, kernel_filename):
         num_correct_trials=5, # under mulitiple correctness trials
         num_perf_trials=5,
         measure_performance=True,
+        timing_method="cuda_event"
     )
 
 
@@ -107,14 +110,15 @@ def test_non_default_stream(threshold=1.5):
     """
     result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
                             kernel_filename="non_default_stream_kernel.py")
+    print(result)
     assert "excessive_speedup" in result.metadata, "Excessive speedup detected"
     print("Result: Eval Function passed Non Default Stream Hack")
     print(result)
 
 
 def main():
-    test_result_cache_reuse()
-    test_input_modification()
+    # test_result_cache_reuse()
+    # test_input_modification()
     test_non_default_stream()
 
 if __name__ == "__main__":
diff --git a/src/unit_tests/test_kernels/non_default_stream_kernel.py b/src/unit_tests/test_kernels/non_default_stream_kernel.py
index b8f4e26a..7321d787 100644
--- a/src/unit_tests/test_kernels/non_default_stream_kernel.py
+++ b/src/unit_tests/test_kernels/non_default_stream_kernel.py
@@ -2,11 +2,14 @@
 import torch.nn as nn
 from torch.utils.cpp_extension import load_inline
 
-###
-# This adversarial kernel assigns all work to a non-default CUDA stream.
-# If the eval script waits on the default stream only when measuring kernel runtime,
-# this will lead to unrealistic speedups.
-###
+###########################################
+# CUDA_STREAM_ADVERSARIAL
+# adversarial kernel assigns all work to a non-default CUDA stream.
+# If the eval script waits only on the default cuda stream when measuring kernel runtime, this will lead to unrealistic speedups.   
+###########################################
+# this adversarial kernel obtains a non-default stream via at::cuda::getStreamFromPool() and uses a CUDAStreamGuard to run the matrix multiplication on that stream. 
+
+# if timing harness only waits on default stream, e.g. torch.cuda.synchronize(), it will measure near-zero time because the actual computation is still running asynchronously on the other stream.
 
 matmul_cuda_source = r"""
 #include <torch/extension.h>
diff --git a/src/unit_tests/test_kernels/result_reuse_kernel.py b/src/unit_tests/test_kernels/result_reuse_kernel.py
index a27e90a6..88258ddf 100644
--- a/src/unit_tests/test_kernels/result_reuse_kernel.py
+++ b/src/unit_tests/test_kernels/result_reuse_kernel.py
@@ -2,13 +2,13 @@
 import torch.nn as nn
 from torch.utils.cpp_extension import load_inline
 
-####
+###########################################
 # 
 # This adversarial kernel utilizes empty which can get allocated the same physical memory as the PyTorch reference outputs
 # If the eval script deletes/frees the PyTorch output object at some point before the custom kernel is run, 
 # the CUDA cache allocator might give that un-erased physical memory to the custom kernel and it will incorrectly pass
 # So, we should ensure we zero out physical memory to prevent reuse
-####
+###########################################
 
 # Returns a zero_
 matmul_cuda_source = r"""
diff --git a/src/unit_tests/test_kernels/zero_out_kernel.py b/src/unit_tests/test_kernels/zero_out_kernel.py
index aaad278b..aba20620 100644
--- a/src/unit_tests/test_kernels/zero_out_kernel.py
+++ b/src/unit_tests/test_kernels/zero_out_kernel.py
@@ -2,12 +2,12 @@
 import torch.nn as nn
 from torch.utils.cpp_extension import load_inline
 
-###
+###########################################
 # This custom kernel zeros out the inputs and returns a zero matrix.
 # It is specifically designed to test the eval script's robustness.
 # Running this zero kernel should result in incorrectness for eval,
 # as the reference kernel will not return all zeros.
-###
+###########################################
 
 # Destroys all inputs, returns all zeros for final matmul shape
 matmul_cuda_source = r"""

From 55eca8ae7449d00f89b4eb3931dbe597848d9b02 Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Wed, 17 Dec 2025 20:43:53 -0800
Subject: [PATCH 07/13] update non default stream kernel

make a non-blocking non-default stream, and use cublasGemmEx rather than at::matmul:
---
 .../test_kernels/non_default_stream_kernel.py | 37 +++++++++++++++----
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/unit_tests/test_kernels/non_default_stream_kernel.py b/src/unit_tests/test_kernels/non_default_stream_kernel.py
index 7321d787..335248c8 100644
--- a/src/unit_tests/test_kernels/non_default_stream_kernel.py
+++ b/src/unit_tests/test_kernels/non_default_stream_kernel.py
@@ -7,20 +7,41 @@
 # adversarial kernel assigns all work to a non-default CUDA stream.
 # If the eval script waits only on the default cuda stream when measuring kernel runtime, this will lead to unrealistic speedups.   
 ###########################################
-# this adversarial kernel obtains a non-default stream via at::cuda::getStreamFromPool() and uses a CUDAStreamGuard to run the matrix multiplication on that stream. 
+# this adversarial kernel obtains a non-default, non-blocking stream via cudaStreamCreateWithFlags(...) and sets to be the stream of the cuBLAS handle.
+# then, it performs a matrix multiplication on this new stream.
 
-# if timing harness only waits on default stream, e.g. torch.cuda.synchronize(), it will measure near-zero time because the actual computation is still running asynchronously on the other stream.
+# if timing harness only waits on default stream, it will measure near-zero time because the actual computation is still running asynchronously on the other stream.
 
 matmul_cuda_source = r"""
 #include <torch/extension.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
 
 torch::Tensor matmul_cuda(torch::Tensor A, torch::Tensor B) {
-    int dev_index = A.get_device();
-    auto stream = at::cuda::getStreamFromPool(false, dev_index);
-    c10::cuda::CUDAStreamGuard guard(stream);
-    auto result = at::matmul(A, B);
+    int M = A.size(0);
+    int K = A.size(1);
+    int N = B.size(1);
+    
+    auto result = torch::empty({M, N}, A.options());
+    
+    cudaStream_t stream;
+    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+    
+    cublasHandle_t handle;
+    cublasCreate(&handle);
+    cublasSetStream(handle, stream);
+    
+    float alpha = 1.0f, beta = 0.0f;
+    
+    cublasGemmEx(handle,
+        CUBLAS_OP_N, CUBLAS_OP_N,
+        N, M, K,
+        &alpha,
+        B.data_ptr<float>(), CUDA_R_32F, N,
+        A.data_ptr<float>(), CUDA_R_32F, K,
+        &beta,
+        result.data_ptr<float>(), CUDA_R_32F, N,
+        CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
     return result;
 }
 """

From ab9e61f232e7ae383e0fbfd87ef2931a335e97ad Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Fri, 19 Dec 2025 00:47:52 +0000
Subject: [PATCH 08/13] reduce trial for adverseiral stream hack

---
 ...{test_eval.py => test_eval_adversarial.py} | 45 +++++++++++++------
 .../test_kernels/non_default_stream_kernel.py |  8 ++++
 2 files changed, 39 insertions(+), 14 deletions(-)
 rename src/unit_tests/{test_eval.py => test_eval_adversarial.py} (77%)

diff --git a/src/unit_tests/test_eval.py b/src/unit_tests/test_eval_adversarial.py
similarity index 77%
rename from src/unit_tests/test_eval.py
rename to src/unit_tests/test_eval_adversarial.py
index 3b77cfce..64a075b8 100644
--- a/src/unit_tests/test_eval.py
+++ b/src/unit_tests/test_eval_adversarial.py
@@ -3,13 +3,13 @@
 import torch
 
 from src.dataset import KERNEL_BENCH_PATH
-from src.eval import eval_kernel_against_ref, load_original_model_and_inputs, set_seed
+from src.eval import KernelExecResult, eval_kernel_against_ref, load_original_model_and_inputs, set_seed
 from src.timing import get_timing_stats, get_timing_function
 from src.utils import read_file
 
 """
 Specifically testing the robustness of the eval function.
-Currently focus on correctness.
+Through a few targeted adversarial kernels.
 
 Run with pytest
 pytest src/unit_tests/test_eval.py
@@ -33,15 +33,20 @@ def measure_ref_kernel(problem_name):
     timing_func = get_timing_function("cuda_event")
     elapsed = timing_func(
         model,
-        *inputs,
-        num_trials=5,
+        inputs,
+        num_warmup=5,
+        num_trials=100,
+        discard_first=1,
         verbose=False,  
         device=device,
     )
-    stats = get_timing_stats(elapsed)
-    return stats
+    timing_stats = get_timing_stats(elapsed)
+    return timing_stats
 
-def run_test_kernel(problem_name, kernel_filename):
+def run_test_kernel(problem_name, 
+                    kernel_filename, 
+                    timing_method="cuda_event",
+                    num_perf_trials=100):
     """
     Run a test kernel agains the reference architecture and return the result.
 
@@ -56,13 +61,15 @@ def run_test_kernel(problem_name, kernel_filename):
     ref_src = read_file(os.path.join(KERNEL_BENCH_PATH, problem_name))
     # TODO: replace with Omar's dataset object
     kernel_src = read_file(os.path.join(os.path.dirname(__file__), "test_kernels", kernel_filename))
+    
     return eval_kernel_against_ref(
         original_model_src=ref_src,
         custom_model_src=kernel_src,
         num_correct_trials=5, # under mulitiple correctness trials
-        num_perf_trials=5,
+        num_perf_trials=num_perf_trials,
         measure_performance=True,
-        timing_method="cuda_event"
+        timing_method=timing_method,
+        # verbose = True
     )
 
 
@@ -108,12 +115,22 @@ def test_non_default_stream(threshold=1.5):
     - Potential Hacking scenario: Eval script waits on the default stream only when measuring kernel runtime, leading to unrealistic speedups.
     - Eval script should accurately measure runtime by waiting on all streams, not just the default stream.
     """
+    ref_result: dict = measure_ref_kernel("level1/1_Square_matrix_multiplication_.py", )
+    ref_runtime = ref_result["mean"] # in ms, reference pytorch speed
+    
+    print(f"Reference runtime: {ref_runtime} ms")
+    timing_method = "cuda_event"
     result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
-                            kernel_filename="non_default_stream_kernel.py")
-    print(result)
-    assert "excessive_speedup" in result.metadata, "Excessive speedup detected"
-    print("Result: Eval Function passed Non Default Stream Hack")
-    print(result)
+                            kernel_filename="non_default_stream_kernel.py",
+                            timing_method=timing_method,
+                            num_perf_trials=10 ) # keep it low as we are leaking cuda stream
+    print(f"Timing {timing_method} mean: {result.runtime} ms")
+    print(f"Full timing result: {result}")
+
+    # TODO : find a way to mark excessive speedup
+    # assert result.runtime >= ref_runtime / threshold, "Excessive speedup detected"
+    # print("Result: Eval Function passed Non Default Stream Hack")
+    # print(result)
 
 
 def main():
diff --git a/src/unit_tests/test_kernels/non_default_stream_kernel.py b/src/unit_tests/test_kernels/non_default_stream_kernel.py
index 335248c8..37fe00fd 100644
--- a/src/unit_tests/test_kernels/non_default_stream_kernel.py
+++ b/src/unit_tests/test_kernels/non_default_stream_kernel.py
@@ -42,6 +42,14 @@
         &beta,
         result.data_ptr<float>(), CUDA_R_32F, N,
         CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT);
+
+    // NOTE: Intentionally leaking CUDA resources (stream + cuBLAS handle) to preserve
+    // adversarial behavior. Calling cudaStreamDestroy() would implicitly synchronize,
+    // defeating the purpose of this test (which checks if the eval harness detects
+    // work launched on non-default streams). Keep num_perf_trials low to avoid OOM.
+    // cublasDestroy(handle);
+    // cudaStreamDestroy(stream);
+    
     return result;
 }
 """

From 251a03e263a06479c66c7b0d9620f429138821a4 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Fri, 19 Dec 2025 01:01:09 +0000
Subject: [PATCH 09/13] show unrealistic speedup

---
 src/eval.py                             | 2 +-
 src/unit_tests/test_eval_adversarial.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/eval.py b/src/eval.py
index 5f1fe8d8..24007b9e 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -112,7 +112,7 @@ class KernelExecResult(BaseModel):
 
     compiled: bool = False
     correctness: bool = False
-    metadata: dict = {}
+    metadata: dict = {} # NOTE: to include warning if any
     runtime: float = -1.0  # in us, only recorded if we decide to measure performance
     runtime_stats: dict = {}  # only recorded if we decide to measure performance
 
diff --git a/src/unit_tests/test_eval_adversarial.py b/src/unit_tests/test_eval_adversarial.py
index 64a075b8..34921c06 100644
--- a/src/unit_tests/test_eval_adversarial.py
+++ b/src/unit_tests/test_eval_adversarial.py
@@ -127,10 +127,10 @@ def test_non_default_stream(threshold=1.5):
     print(f"Timing {timing_method} mean: {result.runtime} ms")
     print(f"Full timing result: {result}")
 
-    # TODO : find a way to mark excessive speedup
-    # assert result.runtime >= ref_runtime / threshold, "Excessive speedup detected"
-    # print("Result: Eval Function passed Non Default Stream Hack")
-    # print(result)
+    effective_speedup = ref_runtime / result.runtime
+    print(f"Effective speedup: {effective_speedup}")
+
+    assert effective_speedup < threshold, f"Excessive speedup detected, we got {effective_speedup}x speedup but expected less than {threshold}x realistically"
 
 
 def main():

From 4fc97517f68c32f8a446cddfb685b9e6c36c6103 Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Thu, 18 Dec 2025 17:39:14 -0800
Subject: [PATCH 10/13] flag excessive speedups in eval script

eval script now flags excessive speedups by timing pytorch reference.
---
 src/eval.py                             | 25 +++++++++++
 src/unit_tests/test_eval_adversarial.py | 55 ++++---------------------
 2 files changed, 32 insertions(+), 48 deletions(-)

diff --git a/src/eval.py b/src/eval.py
index 24007b9e..daa80dde 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -115,6 +115,8 @@ class KernelExecResult(BaseModel):
     metadata: dict = {} # NOTE: to include warning if any
     runtime: float = -1.0  # in us, only recorded if we decide to measure performance
     runtime_stats: dict = {}  # only recorded if we decide to measure performance
+    reference_runtime: float = -1.0  # in us, only recorded if we decide to measure performance
+    reference_runtime_stats: dict = {} # only recorded if we decide to measure performance
 
 
 def load_original_model_and_inputs(
@@ -402,6 +404,7 @@ def eval_kernel_against_ref(
     ),  # have to run on GPU
     backend: str = "cuda",  # can be 'cuda', 'triton', 'tilelang', or 'cute'
     precision: torch.dtype = torch.float32,
+    excessive_speedup_threshold: float = 1.5, # if the kernel is <excessive_speedup_threshold>x faster than the reference, it will get flagged
 ) -> KernelExecResult:
     """
     Evaluate the custom kernel against the original model
@@ -598,6 +601,28 @@ def eval_kernel_against_ref(
                     print(f"[Eval] Performance Stats: {runtime_stats}")
                 kernel_exec_result.runtime = runtime_stats["mean"]
                 kernel_exec_result.runtime_stats = runtime_stats
+
+                # time the PyTorch reference in the same way and flag potential excessive speedups.
+                torch.cuda.synchronize(device=device)
+                reference_elapsed_times = timing_fn(
+                    original_model,
+                    inputs,
+                    num_trials=num_perf_trials,
+                    verbose=verbose,
+                    device=device,
+                )
+                reference_runtime_stats = timing.get_timing_stats(reference_elapsed_times, device=device)
+                kernel_exec_result.reference_runtime = reference_runtime_stats["mean"]
+                kernel_exec_result.reference_runtime_stats = reference_runtime_stats
+
+                effective_speedup = kernel_exec_result.reference_runtime / kernel_exec_result.runtime
+                if effective_speedup > excessive_speedup_threshold:
+                    kernel_exec_result.metadata["excessive_speedup"] = True
+                    if verbose:
+                        print(f"[Eval] Excessive speedup of {effective_speedup}x using timing method {timing_method} detected!")
+                        print("Double check your kernel to ensure it is not reward hacking.")
+
+
         except Exception as e:
             if verbose:
                 print(f"[Eval] Error in Measuring Performance: {e}")
diff --git a/src/unit_tests/test_eval_adversarial.py b/src/unit_tests/test_eval_adversarial.py
index 34921c06..99f9dbf6 100644
--- a/src/unit_tests/test_eval_adversarial.py
+++ b/src/unit_tests/test_eval_adversarial.py
@@ -1,10 +1,7 @@
 import os
 
-import torch
-
 from src.dataset import KERNEL_BENCH_PATH
-from src.eval import KernelExecResult, eval_kernel_against_ref, load_original_model_and_inputs, set_seed
-from src.timing import get_timing_stats, get_timing_function
+from src.eval import eval_kernel_against_ref
 from src.utils import read_file
 
 """
@@ -15,34 +12,6 @@
 pytest src/unit_tests/test_eval.py
 """
 
-def measure_ref_kernel(problem_name):
-    ref_src = read_file(os.path.join(KERNEL_BENCH_PATH, problem_name))
-    Model, get_init_inputs, get_inputs = load_original_model_and_inputs(ref_src, {})
-
-    device = torch.device("cuda")
-    init_inputs = get_init_inputs()
-    init_inputs = [x.to(device) for x in init_inputs]
-
-    with torch.no_grad():
-        set_seed(42)
-        model = Model(*init_inputs).to(device=device, dtype=torch.float32)
-        torch.cuda.synchronize(device=device)
-
-    inputs = get_inputs()
-    inputs = [x.to(device) for x in inputs]
-    timing_func = get_timing_function("cuda_event")
-    elapsed = timing_func(
-        model,
-        inputs,
-        num_warmup=5,
-        num_trials=100,
-        discard_first=1,
-        verbose=False,  
-        device=device,
-    )
-    timing_stats = get_timing_stats(elapsed)
-    return timing_stats
-
 def run_test_kernel(problem_name, 
                     kernel_filename, 
                     timing_method="cuda_event",
@@ -69,7 +38,6 @@ def run_test_kernel(problem_name,
         num_perf_trials=num_perf_trials,
         measure_performance=True,
         timing_method=timing_method,
-        # verbose = True
     )
 
 
@@ -106,7 +74,7 @@ def test_input_modification():
     print(result)
 
 
-def test_non_default_stream(threshold=1.5):
+def test_non_default_stream(timing_method="cuda_event", threshold=1.5):
     """
     Test that we will flag adversarial kernels that cheat by assigning work to non-default CUDA streams.
 
@@ -115,27 +83,18 @@ def test_non_default_stream(threshold=1.5):
     - Potential Hacking scenario: Eval script waits on the default stream only when measuring kernel runtime, leading to unrealistic speedups.
     - Eval script should accurately measure runtime by waiting on all streams, not just the default stream.
     """
-    ref_result: dict = measure_ref_kernel("level1/1_Square_matrix_multiplication_.py", )
-    ref_runtime = ref_result["mean"] # in ms, reference pytorch speed
-    
-    print(f"Reference runtime: {ref_runtime} ms")
-    timing_method = "cuda_event"
     result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
                             kernel_filename="non_default_stream_kernel.py",
                             timing_method=timing_method,
                             num_perf_trials=10 ) # keep it low as we are leaking cuda stream
-    print(f"Timing {timing_method} mean: {result.runtime} ms")
-    print(f"Full timing result: {result}")
-
-    effective_speedup = ref_runtime / result.runtime
-    print(f"Effective speedup: {effective_speedup}")
-
-    assert effective_speedup < threshold, f"Excessive speedup detected, we got {effective_speedup}x speedup but expected less than {threshold}x realistically"
+    assert "excessive_speedup" in result.metadata, "Excessive speedup detected"
+    print("Result: Eval Function Passed Non-Default CUDA Stream Hack")
+    print(result)
 
 
 def main():
-    # test_result_cache_reuse()
-    # test_input_modification()
+    test_result_cache_reuse()
+    test_input_modification()
     test_non_default_stream()
 
 if __name__ == "__main__":

From 99b9faa1343693affa09cf6cfb9647f3b14a0002 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Fri, 19 Dec 2025 03:13:55 +0000
Subject: [PATCH 11/13] reogranize a bit to flag timing on mian stream fails

---
 src/eval.py                             | 89 ++++++++++++++++++-------
 src/unit_tests/test_eval_adversarial.py | 13 +++-
 2 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/src/eval.py b/src/eval.py
index daa80dde..06da727b 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -109,14 +109,19 @@ class KernelExecResult(BaseModel):
     """
     Single Kernel Execution
     """
-
+    # Execution
     compiled: bool = False
     correctness: bool = False
     metadata: dict = {} # NOTE: to include warning if any
+
+    # Timing
     runtime: float = -1.0  # in us, only recorded if we decide to measure performance
     runtime_stats: dict = {}  # only recorded if we decide to measure performance
-    reference_runtime: float = -1.0  # in us, only recorded if we decide to measure performance
-    reference_runtime_stats: dict = {} # only recorded if we decide to measure performance
+
+    # new: added ref time either through fetching prev runs or through execution
+    # could do eager for level 1 and compile for level 2 and 3
+    ref_runtime: float = -1.0  # in us, only recorded if we decide to measure performance
+    ref_runtime_stats: dict = {} # only recorded if we decide to measure performance
 
 
 def load_original_model_and_inputs(
@@ -404,7 +409,10 @@ def eval_kernel_against_ref(
     ),  # have to run on GPU
     backend: str = "cuda",  # can be 'cuda', 'triton', 'tilelang', or 'cute'
     precision: torch.dtype = torch.float32,
-    excessive_speedup_threshold: float = 1.5, # if the kernel is <excessive_speedup_threshold>x faster than the reference, it will get flagged
+
+    # Guard against potential reward hacking
+    check_for_excessive_speedup: bool = False,
+    excessive_speedup_threshold: float = 10, # if the kernel is <excessive_speedup_threshold>x faster than the reference, it will get flagged
 ) -> KernelExecResult:
     """
     Evaluate the custom kernel against the original model
@@ -602,32 +610,63 @@ def eval_kernel_against_ref(
                 kernel_exec_result.runtime = runtime_stats["mean"]
                 kernel_exec_result.runtime_stats = runtime_stats
 
-                # time the PyTorch reference in the same way and flag potential excessive speedups.
-                torch.cuda.synchronize(device=device)
-                reference_elapsed_times = timing_fn(
-                    original_model,
-                    inputs,
-                    num_trials=num_perf_trials,
-                    verbose=verbose,
-                    device=device,
-                )
-                reference_runtime_stats = timing.get_timing_stats(reference_elapsed_times, device=device)
-                kernel_exec_result.reference_runtime = reference_runtime_stats["mean"]
-                kernel_exec_result.reference_runtime_stats = reference_runtime_stats
-
-                effective_speedup = kernel_exec_result.reference_runtime / kernel_exec_result.runtime
-                if effective_speedup > excessive_speedup_threshold:
-                    kernel_exec_result.metadata["excessive_speedup"] = True
-                    if verbose:
-                        print(f"[Eval] Excessive speedup of {effective_speedup}x using timing method {timing_method} detected!")
-                        print("Double check your kernel to ensure it is not reward hacking.")
-
-
         except Exception as e:
             if verbose:
                 print(f"[Eval] Error in Measuring Performance: {e}")
             kernel_exec_result.metadata["error_during_performance"] = e
 
+
+
+    ###############################################################
+    # [Experimental] to be modularized
+    # Condition: custom kernel ModelNew is correct and we are able to time it correctly with kernel_exec_result
+    # We are working on preventing excessive speedup issues
+    ##############################################################
+
+    if measure_performance and check_for_excessive_speedup:  # experimental: hence able to shut off codepath if needed
+    
+        if verbose:
+            print("[Eval] Additional checks to flag excessive speedup")
+
+        torch.cuda.synchronize(device=device)
+        set_seed(seed_num)
+        inputs = get_inputs()
+        # Convert inputs for performance measurement
+        inputs = [_process_input_tensor(x, device, backend, precision) for x in inputs]
+        
+        model_new = custom_model.to(device=device, dtype=precision)
+        torch.cuda.synchronize(device=device)
+
+        # time PyTorch reference function
+        # same timing_fn as specified from before
+        timing_fn = timing.get_timing_function(timing_method)
+        reference_elapsed_times = timing_fn(
+            original_model,
+            inputs, # ideally cloned for extra safety but handled already in correctness check
+            num_trials=num_perf_trials,
+            verbose=verbose,
+            device=device,
+        )
+        reference_runtime_stats = timing.get_timing_stats(reference_elapsed_times, device=device)
+        kernel_exec_result.ref_runtime = reference_runtime_stats["mean"]
+        kernel_exec_result.ref_runtime_stats = reference_runtime_stats
+
+        # Compute Effective Speedup
+        effective_speedup = kernel_exec_result.ref_runtime / kernel_exec_result.runtime
+
+        # TODO: integrate SoL estimation for each unqiue program on destigated hardware
+        # for now, we will use a heuristics such as 5-10x which is very hard to achieve
+
+        if verbose:
+            print(f"[Eval] Effective Speedup is {effective_speedup:.2f}x using timing method {timing_method}")
+
+        if effective_speedup > excessive_speedup_threshold:
+            kernel_exec_result.metadata["excessive_speedup"] = True
+            
+            print(f"[WARNING] Excessive speedup {effective_speedup:.2f}x over {excessive_speedup_threshold}x threshold detected")
+            print(f"[WARNING] Double check your kernel carefully to ensure it is not reward hacking.")
+
+
     graceful_eval_cleanup(context, device, tempfile)
     return kernel_exec_result
 
diff --git a/src/unit_tests/test_eval_adversarial.py b/src/unit_tests/test_eval_adversarial.py
index 99f9dbf6..bbc1ed2d 100644
--- a/src/unit_tests/test_eval_adversarial.py
+++ b/src/unit_tests/test_eval_adversarial.py
@@ -38,6 +38,9 @@ def run_test_kernel(problem_name,
         num_perf_trials=num_perf_trials,
         measure_performance=True,
         timing_method=timing_method,
+        # for checking against reward hacking
+        check_for_excessive_speedup = True, 
+        excessive_speedup_threshold = 10
     )
 
 
@@ -74,7 +77,7 @@ def test_input_modification():
     print(result)
 
 
-def test_non_default_stream(timing_method="cuda_event", threshold=1.5):
+def test_non_default_stream(timing_method="do_bench", threshold=1.5):
     """
     Test that we will flag adversarial kernels that cheat by assigning work to non-default CUDA streams.
 
@@ -87,9 +90,13 @@ def test_non_default_stream(timing_method="cuda_event", threshold=1.5):
                             kernel_filename="non_default_stream_kernel.py",
                             timing_method=timing_method,
                             num_perf_trials=10 ) # keep it low as we are leaking cuda stream
-    assert "excessive_speedup" in result.metadata, "Excessive speedup detected"
-    print("Result: Eval Function Passed Non-Default CUDA Stream Hack")
+
     print(result)
+    if result.metadata.get("excessive_speedup") is True:       
+        raise AssertionError(
+            "Excessive speedup detected, Eval Function did not handle hacky stream"
+        )
+    print("Result: Eval Function Passed Non-Default CUDA Stream Hack")
 
 
 def main():

From 4f510898627835a36727a4fef8f1b20be5eb997e Mon Sep 17 00:00:00 2001
From: Bhavesh Kalisetti <bhaveshkalisetti@gmail.com>
Date: Thu, 18 Dec 2025 19:42:31 -0800
Subject: [PATCH 12/13] update EVAL.md with unit test summary

---
 EVAL.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/EVAL.md b/EVAL.md
index bcff360e..5a2296a1 100644
--- a/EVAL.md
+++ b/EVAL.md
@@ -15,6 +15,13 @@ If the model can reward hack, it will find ways to reward hack!
 We (are) implementing a few ways for timing and understand the tradeoffs.
 
 ### Unit Tests
-See `src/unit_tests/test_eval.py` for unit test designed speciifcally for the eval script. Currently we have added some tests to check tricky scenarios that we can think of.
+
+We've included some unit tests for the eval script in `src/unit_tests/test_eval_adversarial.py`. These tests run adversarial kernels (see `src/unit_tests/test_kernels/`) that contain examples of reward hacking that we've seen from LLMs and ensures that the eval script catches them, either by failing their correctness checks or flagging them for excessive speedups. Examples include:
+- Reusing computations cached during the PyTorch reference
+- Modifying inputs to cheat correctness checks
+- Moving computation to a non-default CUDA stream
+
+We will continue to add more tests as we explore additional adversarial scenarios.
+
  
 Note this is an ongoing community effort. 
\ No newline at end of file

From 61085c576bb3b06b78d4364db474aebde7863b46 Mon Sep 17 00:00:00 2001
From: Simon Guo <simonguo@stanford.edu>
Date: Fri, 19 Dec 2025 07:45:53 +0000
Subject: [PATCH 13/13] ready for merge, update guide a bit (more for sahan to
 keep adding in other PRs)

---
 EVAL.md                                 | 38 +++++++++++++++++++------
 src/eval.py                             | 10 ++++---
 src/unit_tests/test_eval_adversarial.py | 22 ++++++++------
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/EVAL.md b/EVAL.md
index 5a2296a1..c202b065 100644
--- a/EVAL.md
+++ b/EVAL.md
@@ -1,21 +1,40 @@
 # Evaluation
-[WIP] Benchmarking Guide
-To be updated more comprehensively with the benchmarking guide & blog that we have been working on this quarter.
+[WIP] More notes on Benchmarking Guide
+To be updated more comprehensively with the benchmarking guide (ongoing PRs) & blog that we have been working on this quarter.
 
-You should be **extra careful!** 
+You should be **extra CAREFUL!** , be always paranoid about suspiciously good results — kernel engineers and existing compilers are already pretty good, so a >2x speedup for anything is highly unlikely. 
+
+
+> “if you beat cudnn by more than 10%, think again” -- 
+from [itsclivetime](https://x.com/itsclivetime/status/1992155951630307633?s=46)
+
+
+If the model can reward hack, it will find ways to reward hack! This can especially happen during RL training or evolutionary search.
+
+Check out resources here:
+- KernelBench [v0.1 Release](https://scalingintelligence.stanford.edu/blogs/kernelbenchv01/) 
+- Cognition and Stanford's [Kevin](https://arxiv.org/abs/2507.11948) project on various hacking behaviors observed in RL training
+- Jiwei Li's awesome [blogpost](https://deep-reinforce.com/defense_kernel_hack.html) on Hacks and Defenses in Automatic GPU Kernel Generation
+
+Our ongoing blogpost and PRs try to systematize and list out these behaviors and provide tests, detection, and mitigation toolings.
 
-If the model can reward hack, it will find ways to reward hack!
 
 ## Methodology
+More on that coming.
 
+To ensure **consistency and reproducibility**, we recommend using `modal` and we have provided / are adding more various modal cloud functions to standardize the evaluation environment.
 
 ### Correctness
+More coming. We also want to highlight community effort such as [BackendBench](https://www.youtube.com/watch?v=BTfjdyZOKww).
 
 ### Performance
-We (are) implementing a few ways for timing and understand the tradeoffs.
+We highly recommend watching this [lecture](https://www.youtube.com/watch?v=1i7dxoAfKOU) from GPU mode on kernel profiling. 
+
+We have (and continue to) implement various approaches to conduct kernel timing to understand the tradeoffs.
 
-### Unit Tests
+Check out `timing.py` to see available timing methods and `src/unit_tests/test_eval_timing.py` to test out various timing methods (including leveraging `cuda_event` marker, Triton `do_bench`, `host_time` E2E time). @palic and team is working on a blogpost explaining the different tradeoffs soon. 
 
+### Unit Tests with Adversarial Examples
 We've included some unit tests for the eval script in `src/unit_tests/test_eval_adversarial.py`. These tests run adversarial kernels (see `src/unit_tests/test_kernels/`) that contain examples of reward hacking that we've seen from LLMs and ensures that the eval script catches them, either by failing their correctness checks or flagging them for excessive speedups. Examples include:
 - Reusing computations cached during the PyTorch reference
 - Modifying inputs to cheat correctness checks
@@ -23,5 +42,8 @@ We've included some unit tests for the eval script in `src/unit_tests/test_eval_
 
 We will continue to add more tests as we explore additional adversarial scenarios.
 
- 
-Note this is an ongoing community effort. 
\ No newline at end of file
+
+Note: KernelBench is an ongoing open-source effort — please help us with issues and PRs!
+
+
+Shoutout to @bkal01, @palic, @miru_why, @ngc92, @itsclivetime, for their suggestions and feedback. 
\ No newline at end of file
diff --git a/src/eval.py b/src/eval.py
index 06da727b..51bdc6b6 100644
--- a/src/eval.py
+++ b/src/eval.py
@@ -410,9 +410,9 @@ def eval_kernel_against_ref(
     backend: str = "cuda",  # can be 'cuda', 'triton', 'tilelang', or 'cute'
     precision: torch.dtype = torch.float32,
 
-    # Guard against potential reward hacking
-    check_for_excessive_speedup: bool = False,
-    excessive_speedup_threshold: float = 10, # if the kernel is <excessive_speedup_threshold>x faster than the reference, it will get flagged
+    # Guard against potential reward hacking [optional but ongoing enhancement]
+    check_for_excessive_speedup: bool = True,
+    excessive_speedup_threshold: float = 10, # flag if the kernel is more than <excessive_speedup_threshold>x faster than the reference
 ) -> KernelExecResult:
     """
     Evaluate the custom kernel against the original model
@@ -426,6 +426,8 @@ def eval_kernel_against_ref(
     backend: str, one of 'cuda', 'triton', 'tilelang', or 'cute'
     precision: torch.dtype for computation (note: tilelang only supports fp16)
     timing_method: str, method to time kernel, see timing.py for more details 
+
+    ONGOING EFFORT to refactor and modularize this, and adding more tests for eval.
     """
     # TODO: check device is busy
     assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval"
@@ -654,7 +656,7 @@ def eval_kernel_against_ref(
         # Compute Effective Speedup
         effective_speedup = kernel_exec_result.ref_runtime / kernel_exec_result.runtime
 
-        # TODO: integrate SoL estimation for each unqiue program on destigated hardware
+        # TODO: integrate SoL estimation for each unique program on designated hardware
         # for now, we will use a heuristics such as 5-10x which is very hard to achieve
 
         if verbose:
diff --git a/src/unit_tests/test_eval_adversarial.py b/src/unit_tests/test_eval_adversarial.py
index bbc1ed2d..d741c408 100644
--- a/src/unit_tests/test_eval_adversarial.py
+++ b/src/unit_tests/test_eval_adversarial.py
@@ -9,7 +9,7 @@
 Through a few targeted adversarial kernels.
 
 Run with pytest
-pytest src/unit_tests/test_eval.py
+pytest src/unit_tests/test_eval_adversarial.py
 """
 
 def run_test_kernel(problem_name, 
@@ -46,12 +46,12 @@ def run_test_kernel(problem_name,
 
 def test_result_cache_reuse():
     """
-    Test to prevent advserial kernel by reuse computed PyTorch results 
+    Test to prevent adversarial kernel by reuse computed PyTorch results 
 
     Methodology: 
-    - Advserial kernel allocate memory for result but does not initialize it.
+    - Adversarial kernel allocate memory for result but does not initialize it.
     - Potential Hacking scenario: CUDA cache allocator might give that un-erased physical memory to the custom kernel (which contains the result) and it will incorrectly pass
-    - Eval script should flag this as incorrect as the reference kernel did not conduct any menaingful computation
+    - Eval script should flag this as incorrect as the reference kernel did not conduct any meaningful computation
     """
     result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
                             kernel_filename="result_reuse_kernel.py")
@@ -65,10 +65,10 @@ def test_input_modification():
     we still flag it as incorrect.
 
     Methodology:
-    - Advserial kernel zeros out all inputs and returns all zeros for the final matmul shape.
+    - Adversarial kernel zeros out all inputs and returns all zeros for the final matmul shape.
     - Potential Hacking scenario: Since adversarial kernel zero out the input, the reference might ran the kernel with all zeros. 
       Hence both refernece and custom kernel return all zeros, and this incorrectly pass.
-    - Eval script should flag this as incorrect as this kernel zeros out inputs and trick the refernece kernel into returning all zeros.
+    - Eval script should flag this as incorrect as this kernel zeros out inputs and trick the reference kernel into returning all zeros.
     """
     result = run_test_kernel(problem_name="level1/1_Square_matrix_multiplication_.py", 
                             kernel_filename="zero_out_kernel.py")
@@ -92,11 +92,13 @@ def test_non_default_stream(timing_method="do_bench", threshold=1.5):
                             num_perf_trials=10 ) # keep it low as we are leaking cuda stream
 
     print(result)
-    if result.metadata.get("excessive_speedup") is True:       
+    if result.metadata.get("excessive_speedup") is True:   
+        print("Result: Eval Function successfully marked excessive speedup from the hacky CUDA Stream implementation")
+        print("This indicates the eval script properly detected and flagged potential cheating behavior.")
+    else:
         raise AssertionError(
-            "Excessive speedup detected, Eval Function did not handle hacky stream"
+            "Excessive speedup not detected - Eval Function failed to flag adversarial kernel that exploits cuda stream timing hack"
         )
-    print("Result: Eval Function Passed Non-Default CUDA Stream Hack")
 
 
 def main():
@@ -104,5 +106,7 @@ def main():
     test_input_modification()
     test_non_default_stream()
 
+    # we might add more adversarial tests in the future
+
 if __name__ == "__main__":
     main()
\ No newline at end of file