24 changes: 16 additions & 8 deletions README.md
@@ -1,16 +1,19 @@
# KernelBench: Can LLMs Write Efficient GPU Kernels? [ICML '25]
[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench) |
A benchmark for evaluating LLMs' ability to generate efficient GPU kernels

[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench)

<img src="./assets/figures/KernelBenchMascot.png" width="200">

## Versions
The huggingface dataset is updated to v0.1.
- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - Latest version (also main branch)
The latest stable version will be on the `main` branch. We continue to update and improve the repo.
- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - See [blog](https://scalingintelligence.stanford.edu/blogs/kernelbenchv01/)
- [v0](https://github.com/ScalingIntelligence/KernelBench/tree/v0) - Original Release

A benchmark for evaluating LLMs' ability to generate efficient GPU kernels

<img src="./assets/figures/KernelBenchMascot.png" width="200">
The Huggingface [dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench) is updated to v0.1.

<!-- See [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) and [arXiv paper](https://arxiv.org/html/2502.10517v1) for more details. -->
This repo provides core functionality for KernelBench and an easy-to-use set of scripts for evaluation. It is not intended to provide complex agentic scaffolds that solve this task; we recommend cloning and modifying this repo for your experiment, or using it as a git submodule.

## 👋 Task Description
We structure the problem for the LLM to transpile operators described in PyTorch into CUDA kernels, at whatever level of granularity it desires.
@@ -26,7 +29,7 @@ We construct KernelBench to have 4 Levels of categories:
- **Level 4 🤗**: Level Hugging Face
Optimize whole model architectures from HuggingFace

We are actively extending KernelBench to other DSLs beyond `cuda` as well.
We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below).

## ⚖️ Evaluation
#### Methodology
@@ -98,7 +101,12 @@ python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" lev
# add .verbose_logging for more visibility
```

We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support (`cuda`, `triton`, `cute`).
**What you might need to modify**
* **`gpu_arch`** - Depending on your GPU, you may need to adjust the `gpu_arch` argument to reflect your hardware.
* **`precision`** - You can specify the tensor precision with `precision=fp32`. Currently all of our reported results use `fp32`, but we have added support for `fp16` and `bf16`.
* **`backend`** - We also support GPU programming languages beyond `cuda`; simply specify, e.g., `backend=triton`. Supported DSLs: `cuda`, `triton`, `cute`, `tilelang`.

Check the config fields for a comprehensive set of options. An example invocation combining these flags is sketched below.
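
For example, a single-sample run that overrides these options might look like the following (a sketch using the pydra-style `key=value` overrides shown above; the specific values are placeholders):

```
python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" level=1 problem_id=1 backend=triton precision=bf16
```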

### Run on all problems

2 changes: 2 additions & 0 deletions requirements.txt
@@ -7,6 +7,8 @@ modal

# DSLs
nvidia-cutlass-dsl
tilelang
apache-tvm

# helper
tqdm
13 changes: 12 additions & 1 deletion scripts/eval_from_generations.py
@@ -145,6 +145,10 @@ def __init__(self):

# Backend to use for kernel implementation (cuda or triton)
self.backend = "cuda"

# Precision for computation: "fp32", "fp16", "bf16"
self.precision = "fp32"

# Number of samples per problem to evaluate for pass@k analysis
self.num_samples_per_problem = 1 # Default to 1 sample per problem

@@ -188,11 +192,13 @@ def evaluate_single_sample_modal(
num_perf_trials: int = 100,
measure_performance: bool = True,
verbose: bool = False,
backend: str = "cuda",
precision: str = "fp32",
):
"""
Evaluate a single sample on Modal GPU with automatic retries for GPU attachment failures
"""
from src.eval import eval_kernel_against_ref
from src.eval import eval_kernel_against_ref, get_torch_dtype_from_string
from src.utils import set_gpu_arch
import torch
import time
@@ -225,6 +231,8 @@
num_perf_trials=num_perf_trials,
build_dir=None, # Modal doesn't need persistent build dir
device=torch.device("cuda:0"), # Modal has one GPU per container
backend=backend,
precision=get_torch_dtype_from_string(precision),
)

# Force cleanup and exit to prevent container reuse and memory leaks
@@ -321,6 +329,7 @@ def evaluate_single_sample(
build_dir=build_dir,
device=device,
backend=configs.backend,
precision=eval.get_torch_dtype_from_string(configs.precision),
)
return eval_result
except Exception as e:
@@ -491,6 +500,8 @@ def batch_eval_modal(
num_perf_trials=config.num_perf_trials,
measure_performance=config.measure_performance,
verbose=config.verbose,
backend=config.backend,
precision=config.precision,
)
futures.append(future)

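The `precision` string added above is converted to a torch dtype via `get_torch_dtype_from_string`, which this diff imports from `src.eval` but whose implementation is not shown here. A minimal sketch of such a helper, assuming it simply maps the three supported precision strings to torch dtypes, might look like:

```python
import torch

def get_torch_dtype_from_string(precision: str) -> torch.dtype:
    """Map a precision flag ("fp32", "fp16", "bf16") to the corresponding torch dtype."""
    mapping = {
        "fp32": torch.float32,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
    }
    if precision not in mapping:
        raise ValueError(f"Unsupported precision: {precision!r}")
    return mapping[precision]
```

This matches how the scripts in this PR pass a plain `precision` string through their configs and only materialize the torch dtype where `eval_kernel_against_ref` is called.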
8 changes: 5 additions & 3 deletions scripts/generate_and_eval_single_sample.py
@@ -18,7 +18,7 @@
read_file,
set_gpu_arch,
)

from src.eval import get_torch_dtype_from_string
"""
Generate and evaluate a single sample
Easiest way to get started, to test a single problem for experimentation or debugging
@@ -48,6 +48,7 @@ def __init__(self):
# Construct this from mapping from architecture name to torch cuda arch list in the future
# you can either specify SM version or just use the name
self.gpu_arch = ["Ada"]
self.precision = "fp32" # options ["fp32", "fp16", "bf16"]

# Inference config
self.server_type = None
@@ -171,11 +172,11 @@ def main(config: EvalConfig):
# Use appropriate prompt constructor based on backend
if config.backend == "cuda":
custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src)
elif config.backend in ["triton", "cute"]: # removed "tilelang"
elif config.backend in ["triton", "tilelang", "cute"]:
custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend)
else:
raise ValueError(
f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'."
f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', 'tilelang', or 'cute'."
)

if config.log_prompt:
@@ -219,6 +220,7 @@
num_correct_trials=5,
num_perf_trials=100,
backend=config.backend,
precision=get_torch_dtype_from_string(config.precision),
)

print(
17 changes: 9 additions & 8 deletions scripts/generate_and_eval_single_sample_modal.py
@@ -53,7 +53,7 @@ def __init__(self):
# you can either specify SM version or just use the name
self.gpu = "L40S"
self.gpu_arch = ['Ada']

self.precision = "fp32" # options ["fp32", "fp16", "bf16"]

# Inference config
self.server_type = None
@@ -110,8 +110,8 @@ def __repr__(self):
"pytest",
"ninja",
"utils",
# "tilelang", # commented out - not working currently
#"apache-tvm",
"tilelang",
"apache-tvm",
"python-dotenv",
"nvidia-cutlass-dsl",
"litellm[proxy]", # Unified LLM interface
@@ -125,17 +125,18 @@ def __repr__(self):
class EvalFunc:

@modal.method()
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend):
def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend, precision):
# 3. Evaluate Kernel
# NOTE: no need to wrap around process here as only a single sample
# see batch eval for examples of process isolation
from src.eval import eval_kernel_against_ref
from src.eval import get_torch_dtype_from_string
# Use utility function to set the GPU architecture in the modal environment
from src.utils import set_gpu_arch as modal_set_gpu_arch
modal_set_gpu_arch(gpu_arch)
return eval_kernel_against_ref(
ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True,
num_correct_trials=5, num_perf_trials=100, backend=backend
num_correct_trials=5, num_perf_trials=100, backend=backend, precision=get_torch_dtype_from_string(precision)
)

@pydra.main(base=EvalConfig)
@@ -216,10 +217,10 @@ def main(config: EvalConfig):
# Use appropriate prompt constructor based on backend
if config.backend == "cuda":
custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src)
elif config.backend in ["triton", "cute"]: # removed "tilelang"
elif config.backend in ["triton", "tilelang", "cute"]:
custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend)
else:
raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'.")
raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', 'tilelang', or 'cute'.")

if config.log_prompt:
with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f:
@@ -238,7 +239,7 @@

with app.run():
kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote(
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend
ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend, config.precision
)

print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}")
6 changes: 4 additions & 2 deletions scripts/generate_samples.py
@@ -78,6 +78,8 @@ def __init__(self):
self.log_prompt = False

self.backend = "cuda"

self.precision = "fp32"

def greedy(self):
# For greedy decoding, especially baseline eval
@@ -129,11 +131,11 @@ def generate_sample_single(
custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template(
ref_arch_src
)
elif config.backend in ["triton", "cute"]: # removed "tilelang"
elif config.backend in ["triton", "cute", "tilelang"]:
custom_cuda_prompt = get_prompt_for_backend(ref_arch_src, config.backend)
else:
raise ValueError(
f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'."
f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', 'cute', or 'tilelang'."
)
if config.log_prompt:
prompt_path = os.path.join(
7 changes: 5 additions & 2 deletions scripts/run_and_check.py
@@ -9,7 +9,6 @@
from src import eval as kernel_eval
from src import utils as kernel_utils
from scripts.generate_baseline_time import measure_program_time

from src.utils import read_file

"""
@@ -68,6 +67,8 @@ def __init__(self):

# Replace with your NVIDIA GPU architecture, e.g. ["Hopper"]
self.gpu_arch = ["Ada"]
self.precision = "fp32"
self.backend = "cuda"

def __repr__(self):
return f"ScriptConfig({self.to_dict()})"
@@ -97,7 +98,9 @@ def evaluate_single_sample_src(ref_arch_src: str, kernel_src: str, configs: dict
num_correct_trials=num_correct_trials,
num_perf_trials=num_perf_trials,
build_dir=build_dir,
device=device
device=device,
backend=configs["backend"],
precision=kernel_eval.get_torch_dtype_from_string(configs["precision"])
)
return eval_result
except Exception as e: