From 76f4eb7de3056e3472adf19f7f1c1ccb9b308d0d Mon Sep 17 00:00:00 2001
From: 01xjw <220233704@seu.edu.cn>
Date: Wed, 21 Jan 2026 08:31:54 +0800
Subject: [PATCH 1/3] Add AMD Radeon GPU support
---
README.md | 188 ++++++---
pyproject.toml | 58 ---
requirements.txt | 53 +--
setup.py | 9 +
src/kernelbench/eval.py | 249 ++++++++++--
src/kernelbench/prompt_constructor_toml.py | 187 ++++++++-
src/kernelbench/prompts/hardware/gpu_specs.py | 236 ++++++++++-
src/kernelbench/prompts/prompts.toml | 8 +-
src/kernelbench/utils.py | 375 +++++++++++++-----
9 files changed, 1055 insertions(+), 308 deletions(-)
delete mode 100644 pyproject.toml
create mode 100644 setup.py
diff --git a/README.md b/README.md
index 7343e73b..1a01270b 100644
--- a/README.md
+++ b/README.md
@@ -1,19 +1,16 @@
# KernelBench: Can LLMs Write Efficient GPU Kernels? [ICML '25]
-A benchmark for evaluating LLMs' ability to generate efficient GPU kernels
-
-[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench)
-
-
+[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench)
## Versions
-The latest stable version will be on `main` branch. We continue to update and improve the repo.
-- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - See [blog](https://scalingintelligence.stanford.edu/blogs/kernelbenchv01/)
+The Hugging Face dataset is updated to v0.1.
+- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - Latest version (also main branch)
- [v0](https://github.com/ScalingIntelligence/KernelBench/tree/v0) - Original Release
+A benchmark for evaluating LLMs' ability to generate efficient GPU kernels.
-The Huggingface [dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench) is updated to v0.1.
+
-This repo provides core functionality for KernelBench and an easy-to-use set of scripts for evaluation. It is not intended to provide complex agentic scaffolds that solve this task; we recommend cloning and modifying this repo for your experiment, or using it as a git submodule.
+
## 👋 Task Description
We structure the problem for LLMs to transpile operators described in PyTorch into CUDA kernels, at whatever level of granularity they desire.
@@ -29,7 +26,7 @@ We construct KernelBench to have 4 Levels of categories:
- **Level 4 🤗**: Level Hugging Face
Optimize whole model architectures from HuggingFace
-We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below).
+We are actively extending KernelBench to other DSLs beyond `cuda` as well.
## ⚖️ Evaluation
#### Methodology
@@ -37,9 +34,9 @@ To evaluate model-generated kernels, we need to check if they:
- **is correct ✅**: check against reference torch operators `n_correctness` times on randomized inputs.
- **is performant ⏱️**: compare against reference torch operators `n_trial` times to measure speedup between runtimes.
-Check out `src/eval.py` for details on how we implement correctness check and timing and `EVAL.md` for notes on evaluation and benchmarking guidelines [WIP].
+Check out `src/eval.py` for details on how we implement correctness check and timing.
-We provide a convenient script `scripts/run_and_check.py` to evaluate one single sample source code against a reference source code, check correctness and compute speedup. You can use this to evaluate a kernel either locally or remotely by setting `eval_mode=local` or `eval_mode=modal`.
+We provide a convenient script `scripts/run_and_check.py` to evaluate a single sample source file against a reference source file, checking correctness and computing speedup. You can use this to evaluate a model-generated kernel.
#### Overall Benchmark Metric
@@ -66,89 +63,164 @@ We organize the repo into the following structure:
KernelBench/
├── assets/
├── KernelBench/ # Benchmark dataset files
-├── src/kernelbench/ # KernelBench logic code
+├── src/ # KernelBench logic code
│ ├── unit_tests/
│ ├── prompts/
│ ├── ....
├── scripts/ # helpful scripts to run the benchmark
├── results/ # baseline times across hardware
├── runs/ # where your runs will be stored
-├── notebooks/ # example notebooks for analysis
-├── pyproject.toml # Project configuration and dependencies
```
## 🔧 Set up
+```
+conda create --name kernel-bench python=3.10
+conda activate kernel-bench
+pip install -r requirements.txt
+pip install -e .
+```
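+
+To confirm the editable install is importable (optional check; `kernelbench` is the package installed by `pip install -e .` above):
+```
+python - <<'PY'
+import kernelbench
+print("kernelbench imported from:", kernelbench.__file__)
+PY
+```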
-We have transitioned to using `pyproject.toml` and `uv` for dependency management. Install [uv](https://docs.astral.sh/uv/getting-started/installation/) if you haven't already
+### GPU Setup
+Running and profiling kernels require a GPU.
+If you don't have a GPU available locally, you can set up [Modal](https://modal.com/). Set up your modal token after creating an account by running `modal token new`. Then, use the `generate_and_eval_single_sample_modal.py` script.
-```bash
-# Install base dependencies (works without a local GPU)
-uv sync
+#### NVIDIA (CUDA)
+- Use default backend `cuda` (recommended).
+- Ensure a CUDA-enabled PyTorch install.
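+
+A quick way to confirm PyTorch sees an NVIDIA GPU (illustrative check, mirroring the ROCm snippet below):
+```
+python - <<'PY'
+import torch
+print("CUDA:", torch.version.cuda)
+print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none")
+PY
+```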
-# Install with GPU dependencies (for local GPU evaluation)
-uv sync --extra gpu
+#### AMD ROCm (Radeon / MI-Series)
+KernelBench can run on AMD GPUs via ROCm (HIP) using the same PyTorch `torch.cuda` API.
-# Run commands with uv (which invoke the right env)
-uv run python scripts/.py ...
+1) Install ROCm-enabled PyTorch (pick the correct ROCm version for your system):
+```
+# Example (adjust ROCm version as needed)
+pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/rocm6.4
```
-You can still use `conda (python=3.10)` to create your environment and install dependencies with `requirements.txt`.
+2) Verify GPU visibility:
+```
+python - <<'PY'
+import torch
+print("HIP:", torch.version.hip)
+print("GPU:", torch.cuda.get_device_name(0))
+print(torch.cuda.get_device_properties(0))
+PY
+```
-We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`.
+3) Optional: select specific GPU(s)
+```
+export HIP_VISIBLE_DEVICES=0
+export ROCR_VISIBLE_DEVICES=0
+```
-Running and profiling kernels require a GPU.
-If you don't have a GPU available locally, you can set up [Modal](https://modal.com/) for cloud serverless GPU evaluation. Set up your modal token after creating an account by running `modal token new`. Then, use the `generate_and_eval_single_sample_modal.py` script.
+> Note: For AMD, use `backend=triton` or `backend=helion` where applicable; the `cuda` backend is NVIDIA-only.
-You can also try out our [tutorial notebook](https://bit.ly/kernelbench-neurips-colab) (also in notebooks/tutorial.ipynb) with Google Colab.
+##### AMD ROCm Tips
+- **What works**: AMD hardware-aware prompts, Triton backend generation, and ROCm-friendly timing.
+- **What does not (by default)**: CUDA backend evaluation on ROCm is blocked to avoid CUDA-only compile paths.
+- **Troubleshooting**: Ensure Triton is ROCm-enabled and PyTorch is a ROCm build.
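+
+For reference, a minimal sketch of the kind of vendor check involved (the eval code distinguishes ROCm builds via `torch.version.hip`; this snippet is illustrative, not the exact script logic):
+```
+python - <<'PY'
+import torch
+is_rocm = torch.cuda.is_available() and getattr(torch.version, "hip", None) is not None
+print("ROCm build:", is_rocm)
+print("Suggested backends:", "triton / helion" if is_rocm else "cuda / triton")
+PY
+```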
-## 🚀 Usage
-### Run on a single problem
-It is easier to get started with a single problem. This will fetch the problem, generate a sample, and evaluate the sample.
+To call LLM API providers, set the provider API key in your environment:
+```
+export OPENAI_API_KEY="your_api_key_here"
+```
-```bash
-# for example, run level 2 problem 40 from huggingface and use google gemini 2.5 flash for generation
+## 🚀 Usage
+### Run on a single problem
+This will fetch the problem, generate a sample, and evaluate the sample.
-uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface level=2 problem_id=40 server_type=google model_name=gemini/gemini-2.5-flash
+```
+# Example: run level 2 problem 40 from Hugging Face
+python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" level=2 problem_id=40
# dataset_src could be "local" or "huggingface"
-# add .verbose_logging for more visbility
+# add .verbose_logging for more visibility
```
-**What you might need to modify**
-* **`gpu_arch`** - Depend on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware.
-* **`precision`** - You can specify the precision of tensor by `precision=fp32`. Currently all of our reported results are `fp32` but we added support for `fp16` & `bf16`.
-* **`backend`** - We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support DSLs: `cuda`, `triton`, `cute`, `tilelang`, `thunderkittens`.
+We also support other GPU programming languages beyond `cuda`. Set `backend=triton`, `backend=cute`, or `backend=helion` as needed.
+#### AMD ROCm Example Commands
+Use `backend=triton` (recommended) or `backend=helion` on AMD GPUs:
+```
+# Triton on AMD ROCm (single problem)
+python3 scripts/generate_and_eval_single_sample.py \
+ dataset_src="huggingface" level=2 problem_id=40 \
+ backend=triton
+
+# Helion on AMD ROCm (single problem, still in progress)
+python3 scripts/generate_and_eval_single_sample.py \
+ dataset_src="huggingface" level=2 problem_id=40 \
+ backend=helion
+```
-Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory, `export THUNDERKITTENS_ROOT=`, and all ThunderKitten programs as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enable the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.
+If you want to target a specific AMD GPU:
+```
+HIP_VISIBLE_DEVICES=0 ROCR_VISIBLE_DEVICES=0 \
+python3 scripts/generate_and_eval_single_sample.py \
+ dataset_src="huggingface" level=2 problem_id=40 \
+ backend=triton
+```
-Check the config fields for comprehensive set of options. Note we provide the model with a one-shot example by default along with the minimum set of info; you can check out other prompt settings or construct your own in `src/prompt_constructor_toml.py`.
+##### Optional: Force AMD Prompt Inputs
+Some scripts auto-detect GPU vendor/name. You can override:
+```
+python3 scripts/generate_and_eval_single_sample.py \
+ dataset_src=huggingface \
+ level=1 \
+ problem_id=1 \
+ backend=triton \
+ gpu_vendor=amd \
+ gpu_name=MI355X
+```
-### Run on all problems
+### Run on all problems
-```bash
+```
# 1. Generate responses and store kernels locally to runs/{run_name} directory
-uv run python scripts/generate_samples.py run_name=test_hf_level_1 dataset_src=huggingface level=1 num_workers=50 server_type=deepseek model_name=deepseek-chat temperature=0
+python3 scripts/generate_samples.py \
+ run_name=test_hf_level_1 dataset_src=huggingface level=1 num_workers=50 \
+ server_type=deepseek model_name=deepseek-chat temperature=0
+
+# If you use LLM_GATEWAY_KEY (AMD gateway), set server_type=openai and temperature=1
-# 2. Evaluate on all generated kernels in runs/{run_name} directory
-uv run python scripts/eval_from_generations.py run_name=test_hf_level_1 dataset_src=local level=1 num_gpu_devices=8 timeout=300
+# 2. Evaluate all generated kernels in runs/{run_name}
+python3 scripts/eval_from_generations.py run_name=test_hf_level_1 dataset_src=local level=1 num_gpu_devices=8 timeout=300
-# If you like to speedup evaluation, you can use parallelize compilation on CPUs before getting to evaluation on GPUs
-# add build_cache=True and num_cpu_workers= to the command
+# To speed up evaluation, parallelize compilation on CPUs before GPU evaluation.
+# Add build_cache=True and num_cpu_workers= to the command.
```
-### Analyze the eval results to compute Benchmark Performance
-We provide `scripts/benchmark_eval_analysis.py` to analyze the eval results to compute success rate, timing metric, and overall benchmark performance `fast_p`.
-```bash
-uv run python scripts/benchmark_eval_analysis.py run_name=test_hf_level_1 level=1 hardware=L40S_matx3 baseline=baseline_time_torch
+##### AMD Triton Quick Start (batch)
+```
+python3 scripts/generate_samples.py \
+ run_name=amd_test \
+ dataset_src=huggingface \
+ level=1 \
+ backend=triton
+
+python3 scripts/eval_from_generations.py \
+ run_name=amd_test \
+ dataset_src=huggingface \
+ level=1 \
+ backend=triton \
+ eval_mode=local
```
-If you are using a different hardware, you can generate the baseline time with `scripts/generate_baseline_time.py` script.
-We provide some reference baseline times a variety of NVIDIA GPUs across generations in `results/timing`, but we recommend you to generate your own baseline time for more accurate results (cluster power, software version, all affects timing result). See `results/timing/README.md` for more details.
-### Multi-Turn Framework & Integrations
-We have also releaed the test-time framework [Caesar](https://github.com/ScalingIntelligence/caesar) that are used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.
+##### AMD Baseline Timing
+```
+python3 scripts/get_baseline_time_single_problem.py
+```
+### Analyze the eval results to compute Benchmark Performance
+Use `scripts/benchmark_eval_analysis.py` to compute success rate, timing metrics, and overall benchmark performance `fast_p`.
-You can also use KernelBench as a library for your projects, for example: `from kernelbench import timing`, `from kernelbench import eval as kb_eval`, or `from kernelbench.utils import set_gpu_arch`.
+```
+python3 scripts/benchmark_eval_analysis.py run_name=test_hf_level_1 level=1 hardware=L40S_matx3 baseline=baseline_time_torch
+```
+If you use different hardware, generate a baseline with `scripts/generate_baseline_time.py`.
+We provide reference baselines for various NVIDIA GPUs in `results/timing`, but we recommend generating your own for accuracy (cluster power and software versions affect timing). See `results/timing/README.md` for details.
+
+### Multi-Turn Framework
+We have also released the test-time framework [Caesar](https://github.com/simonguozirui/caesar), which is used for the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.
## 🛣️ Upcoming Roadmap
Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issues/74) for what we plan to add as features. We welcome community contributions in these directions.
@@ -156,8 +228,6 @@ Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issue
## 🔍 Known Usage
Since release, we have gotten a lot of interest from researchers, research labs, and companies that use KernelBench to explore this direction. We have documented [known usage](https://docs.google.com/document/d/e/2PACX-1vTjS-UMH1HB5n_PENq2k-3YRfXIXkqKIKeNC2zcWMyLPdl4Jrwvdk4dNDVSsM8ybKrCxZB7GJq1slZF/pub) of KernelBench and related efforts towards automated kernel generations. If you are using KernelBench, we love to hear more about it!
-Disclaimer: KernelBench is designed as an open-source evaluation framework and toolkit. The KernelBench team does not review, validate, or endorse individual kernels or reported results. Users are responsible for independently verifying any results obtained using the framework. Please check out `EVAL.md` for more guidance on benchmarking and evaluating kernels.
-
## 🪪 License
MIT. Check `LICENSE.md` for more details.
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index bed37150..00000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,58 +0,0 @@
-[build-system]
-requires = ["setuptools>=61.0"]
-build-backend = "setuptools.build_meta"
-
-# this should be our single source of truth for versioning
-
-[project]
-name = "kernelbench"
-version = "0.2.0.dev0"
-requires-python = "==3.10.*"
-dependencies = [
- # Frameworks
- "torch==2.9.0",
-
- "transformers",
- "datasets",
- "modal",
-
- # helper
- "tqdm",
- "packaging",
- "setuptools",
- "pydra-config",
- "ninja",
- "tomli",
- "tabulate",
-
- # Numerics
- "einops",
- "python-dotenv",
- "numpy",
-
- # LLM providers
- "openai",
- "litellm[proxy]",
-]
-
-[project.optional-dependencies]
-gpu = [
- # GPU-specific dependencies (requires CUDA)
- "triton",
- "nvidia-cutlass-dsl",
- "tilelang",
- "cupy-cuda12x",
- "nsight-python",
-]
-dev = [
- "pytest",
- "ruff",
-]
-
-
-[tool.setuptools.packages.find]
-where = ["src"]
-include = ["kernelbench*"]
-
-[tool.setuptools.package-data]
-kernelbench = ["prompts/**/*"]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 07603a86..805c61be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,35 +1,38 @@
-# ARCHIVED: We are transitioning to pyproject.toml and uv-based project management
-# However, we provide this as a backup for now
-
# Frameworks
-# we use latest PyTorch stable release
-torch==2.9.*
-triton==3.5.*
-
+# torch==2.5.0
# we shall upgrade torch for blackwell when it is stable
-transformers>=4.57.3
-datasets>=4.4.2
-modal>=1.3.0
+# AMD ROCm note: install ROCm-enabled torch from the PyTorch ROCm index.
+# Current ROCm env:
+# torch==2.8.0+rocm7.1.1.gitcba8b9d2
+# HIP==7.1.52802-26aae437f6
+# ROCm SMI (concise):
+# Device IDs: 0x7551 x4
+transformers
+datasets
+modal
# DSLs
-nvidia-cutlass-dsl
-tilelang
+# nvidia-cutlass-dsl
+# triton (required for AMD ROCm kernels)
+# helion (optional, Helion DSL; install separately if needed)
# helper
-tqdm>=4.67.1
+tqdm
packaging
-pydra-config
-ninja>=1.13.0
-cupy-cuda12x==13.6.0
-tomli>=2.3.0
-tabulate>=0.9.0
-nsight-python
+pydra_config
+dill>=0.3.7,<0.4
+pytest
+ninja
# Numerics
-einops>=0.8.1
-python-dotenv>=1.2.1
-numpy==2.4.0
+einops
+python-dotenv
+numpy
+
+# to deprecate with litellm
+google-generativeai
+together
+openai
+anthropic
+pydantic==2.12.4
-# use litellm for cloud providers and openai for local
-openai>=2.14.0
-litellm[proxy]>=1.80.10
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..83d82456
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,9 @@
+from setuptools import setup, find_packages
+
+if __name__ == "__main__":
+ setup(
+ name="kernelbench",
+ version="0.2.0",
+ package_dir={"": "src"},
+ packages=find_packages(where="src"),
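+        # NOTE: the removed pyproject.toml also packaged prompt assets via
+        # [tool.setuptools.package-data] kernelbench = ["prompts/**/*"].
+        # For a non-editable install, an equivalent here might be (illustrative):
+        #   package_data={"kernelbench": ["prompts/**/*"]},
+        #   include_package_data=True,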
+ )
diff --git a/src/kernelbench/eval.py b/src/kernelbench/eval.py
index dd79b2c0..5ccfa708 100644
--- a/src/kernelbench/eval.py
+++ b/src/kernelbench/eval.py
@@ -1,5 +1,9 @@
"""
Helpers for Evaluations
+
+Supports both NVIDIA CUDA and AMD ROCm GPUs.
+ROCm support is provided through PyTorch's HIP backend, which exposes
+the same torch.cuda API for AMD GPUs.
"""
import hashlib
@@ -13,7 +17,7 @@
import traceback
from contextlib import redirect_stderr, redirect_stdout
from io import StringIO
-from typing import Union, Optional
+from typing import Union, Optional, Literal
import numpy as np
import requests
@@ -23,6 +27,103 @@
from . import timing, dataset
+
+################################################################################
+# GPU Detection and Compatibility
+################################################################################
+
+def is_rocm_available() -> bool:
+ """
+ Check if ROCm (AMD GPU) is available.
+ ROCm uses PyTorch's HIP backend which exposes torch.cuda API.
+ """
+ if not torch.cuda.is_available():
+ return False
+ # Check for HIP version (ROCm indicator)
+ return hasattr(torch.version, 'hip') and torch.version.hip is not None
+
+
+def is_cuda_available() -> bool:
+ """
+ Check if NVIDIA CUDA is available (not ROCm).
+ """
+ if not torch.cuda.is_available():
+ return False
+ return not is_rocm_available()
+
+
+def get_gpu_vendor() -> Literal["nvidia", "amd", "unknown"]:
+ """
+ Detect the GPU vendor (NVIDIA or AMD).
+ """
+ if not torch.cuda.is_available():
+ return "unknown"
+ if is_rocm_available():
+ return "amd"
+ return "nvidia"
+
+
+def get_gpu_info(device: torch.device = None) -> dict:
+ """
+ Get GPU information including vendor, name, and memory.
+
+ Returns:
+ dict with keys: vendor, name, memory_total_gb, compute_capability (NVIDIA only)
+ """
+ if device is None:
+ device = torch.cuda.current_device()
+
+ info = {
+ "vendor": get_gpu_vendor(),
+ "name": torch.cuda.get_device_name(device),
+ "memory_total_gb": torch.cuda.get_device_properties(device).total_memory / (1024**3),
+ }
+
+ # Add compute capability for NVIDIA GPUs
+ if info["vendor"] == "nvidia":
+ props = torch.cuda.get_device_properties(device)
+ info["compute_capability"] = f"{props.major}.{props.minor}"
+
+ # Add ROCm-specific info for AMD GPUs
+ if info["vendor"] == "amd":
+ info["hip_version"] = torch.version.hip
+ # Try to get architecture info
+ try:
+ props = torch.cuda.get_device_properties(device)
+ info["gcn_arch"] = getattr(props, 'gcnArchName', 'unknown')
+ except:
+ pass
+
+ return info
+
+
+def check_gpu_available(verbose: bool = False) -> bool:
+ """
+ Check if any GPU (CUDA or ROCm) is available.
+
+ Args:
+ verbose: If True, print GPU information
+
+ Returns:
+ True if GPU is available, False otherwise
+ """
+ if not torch.cuda.is_available():
+ if verbose:
+ print("[GPU] No GPU available")
+ return False
+
+ if verbose:
+ gpu_info = get_gpu_info()
+ vendor_name = "AMD ROCm" if gpu_info["vendor"] == "amd" else "NVIDIA CUDA"
+ print(f"[GPU] {vendor_name} available: {gpu_info['name']}")
+ print(f"[GPU] Memory: {gpu_info['memory_total_gb']:.1f} GB")
+ if gpu_info["vendor"] == "amd":
+ print(f"[GPU] HIP Version: {gpu_info.get('hip_version', 'unknown')}")
+ else:
+ print(f"[GPU] Compute Capability: {gpu_info.get('compute_capability', 'unknown')}")
+
+ return True
+
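+# Illustrative usage of the detection helpers above (comment-only sketch;
+# field names follow get_gpu_info() as defined in this file):
+#
+#   if check_gpu_available(verbose=True):
+#       info = get_gpu_info()
+#       if info["vendor"] == "amd":
+#           print("ROCm HIP version:", info.get("hip_version"))
+#       else:
+#           print("Compute capability:", info.get("compute_capability"))
+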
REPO_TOP_PATH = os.path.abspath(
os.path.join(
os.path.dirname(__file__),
@@ -63,9 +164,19 @@ def fetch_ref_arch_from_level_problem_id(level, problem_id, with_name=False):
def set_seed(seed: int):
+ """
+ Set random seed for reproducibility.
+ Works with both NVIDIA CUDA and AMD ROCm GPUs.
+ """
torch.manual_seed(seed)
- # NOTE: this only sets on current cuda device
- torch.cuda.manual_seed(seed)
+ # NOTE: this sets on current GPU device (CUDA or ROCm via HIP)
+ if torch.cuda.is_available():
+ torch.cuda.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed) # for multi-GPU
+ # Set deterministic behavior
+ # NOTE: cudnn settings may not be fully supported on ROCm
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
def get_torch_dtype_from_string(precision: str) -> torch.dtype:
"""
@@ -225,24 +336,39 @@ def graceful_eval_cleanup(
tempfile: tempfile.NamedTemporaryFile = None,
):
"""
- Clean up env, gpu cache, and compiled CUDA extensions after evaluation
- """ # delete ran-specific function definitions before next eval run
+ Clean up environment, GPU cache, and compiled extensions after evaluation.
+ Works with both NVIDIA CUDA and AMD ROCm GPUs.
+ """
+ # Clean up linecache entries
+    # Clean up linecache entries created for temp modules before the next eval run
+    fake_filenames = [k for k in linecache.cache.keys() if k.startswith(...)]  # prefix filter elided in this excerpt
) -> KernelExecResult:
"""
- Evaluate the custom kernel against the original model
+ Evaluate the custom kernel against the original model.
+
+ Supports both NVIDIA CUDA and AMD ROCm GPUs.
     NOTE: we are thinking about refactoring this to be more modularized
- and we can add more checks as our other ongiong PRs are working on
+ and we can add more checks as our other ongoing PRs are working on
- num_correct_trials: number of trials to initialize different random inputs; correctness pass only if all trials pass
- num_perf_trials: run the evalutation many times to take the average
- device: GPU (cuda) device to run the evalutation on
- backend: str, one of 'cuda', 'triton', 'tilelang', or 'cute'
- precision: torch.dtype for computation (note: tilelang only supports fp16)
- timing_method: str, method to time kernel, see timing.py for more details
+ Args:
+ original_model_src: Source code of the reference PyTorch model
+ custom_model_src: Source code of the optimized model with custom kernels
+ seed_num: Random seed for reproducibility
+ num_correct_trials: Number of trials with different random inputs; pass only if all pass
+ num_perf_trials: Run the evaluation many times to take the average
+ measure_performance: Whether to measure and compare performance
+ timing_method: Method to time kernel, see timing.py for more details
+ verbose: Enable verbose logging
+ build_dir: Directory for caching compiled kernels
+ device: GPU device to run evaluation on (CUDA or ROCm)
+ backend: One of 'cuda', 'triton', 'tilelang', 'cute', or 'helion'
+ precision: torch.dtype for computation (note: tilelang only supports fp16)
+ check_for_excessive_speedup: Guard against potential reward hacking
+ excessive_speedup_threshold: Flag if kernel is more than this faster than reference
+
+ Returns:
+ KernelExecResult with compilation status, correctness, and performance metrics
ONGOING EFFORT to refactor and modularize this, and adding more tests for eval.
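+
+    Example (illustrative sketch; `ref_src`/`kernel_src` are placeholder source strings,
+    other arguments are left at their defaults, and result fields follow the
+    KernelExecResult description above):
+        result = eval_kernel_against_ref(
+            original_model_src=ref_src,
+            custom_model_src=kernel_src,
+            backend="triton",
+            device=torch.device("cuda:0"),
+        )
+        print(result.compiled, result.correctness)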
"""
- # TODO: check device is busy
- assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval"
+ # Check GPU availability (works for both CUDA and ROCm)
+ if not check_gpu_available(verbose=verbose):
+ raise RuntimeError("No GPU available (CUDA or ROCm), cannot run Eval")
+
+ # Get GPU vendor info for metadata
+ gpu_vendor = get_gpu_vendor()
+ gpu_info = get_gpu_info(device if isinstance(device, int) else None)
if backend.lower() == "tilelang":
assert precision == torch.float16 or precision == torch.bfloat16, "TileLang only supports fp16 or bfloat16"
@@ -439,35 +584,56 @@ def eval_kernel_against_ref(
linewidth=80, # Maximum width before wrapping
)
- # set CUDA device
+ # Set GPU device (works for both CUDA and ROCm via HIP)
torch.cuda.set_device(device)
- # Backends that use tempfile approach and need CUDA_VISIBLE_DEVICES
- # TileLang, Triton, and CuTe all use tempfile for proper module loading
- uses_tempfile = backend.lower() in ["triton", "tilelang", "cute"]
+ # Backends that use tempfile approach
+ # - triton: @triton.jit decorator requires file-based import
+ # - cute: CUTLASS requires file-based compilation
+ # - helion: @helion.kernel decorator requires inspect.getsource()
+ # - tilelang: JIT requires file-based import
+ backend_lower = backend.lower()
+ uses_tempfile = backend_lower in ["triton", "tilelang", "cute", "helion"]
metadata = {} # for storing result metadata
metadata["hardware"] = torch.cuda.get_device_name(device=device)
- metadata["device"] = str(device) # for debugging
+ metadata["device"] = str(device)
+ metadata["gpu_vendor"] = gpu_vendor
+ metadata["backend"] = backend_lower
+
+ # Add vendor-specific info
+ if gpu_vendor == "amd":
+ metadata["hip_version"] = gpu_info.get("hip_version", "unknown")
+ metadata["gcn_arch"] = gpu_info.get("gcn_arch", "unknown")
+ else:
+ metadata["compute_capability"] = gpu_info.get("compute_capability", "unknown")
if uses_tempfile:
- # need to set env var for triton/cute code to guarantee no wrong device shenanigans
+ # Set device visibility for triton/cute/helion/tilelang
if isinstance(device, int):
device_num = device
elif isinstance(device, torch.device):
assert (
device.type == "cuda"
- ), "CUDA is not availible on device, cannot run Eval"
- device_num = device.index
+ ), "GPU is not available on device, cannot run Eval"
+ device_num = device.index if device.index is not None else 0
else:
raise ValueError(
f"device must be an int or torch.device, got {type(device)}"
)
- os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
+
+ # Set device visibility
+ # For ROCm, use HIP_VISIBLE_DEVICES; for CUDA, use CUDA_VISIBLE_DEVICES
+ if gpu_vendor == "amd":
+ os.environ["HIP_VISIBLE_DEVICES"] = str(device_num)
+ os.environ["ROCR_VISIBLE_DEVICES"] = str(device_num)
+ else:
+ os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num)
context = {}
if verbose:
- print(f"[Eval] Start Evalulation! on device: {device}")
+ vendor_str = "AMD ROCm" if gpu_vendor == "amd" else "NVIDIA CUDA"
+ print(f"[Eval] Start Evaluation on device: {device} ({vendor_str})")
print("[Eval] Loading Original Model")
Model, get_init_inputs, get_inputs = load_original_model_and_inputs(
@@ -495,8 +661,7 @@ def eval_kernel_against_ref(
tempfile = None
# add hash for later to distinguish between multi-turn kernels
- backend_lower = backend.lower()
- if backend_lower in ["triton", "tilelang", "cute"]:
+ if backend_lower in ["triton", "tilelang", "cute", "helion"]:
         # Use tempfile approach for triton, tilelang, cute, and helion
# These DSLs require proper module import for JIT decorators to work
ModelNew, tempfile = load_custom_model_with_tempfile(
diff --git a/src/kernelbench/prompt_constructor_toml.py b/src/kernelbench/prompt_constructor_toml.py
index 4349a74d..82dcdda3 100644
--- a/src/kernelbench/prompt_constructor_toml.py
+++ b/src/kernelbench/prompt_constructor_toml.py
@@ -24,6 +24,15 @@
"hardware_best_practices",
]
+# AMD-specific hardware component keys
+AMD_HARDWARE_COMPONENT_KEYS = [
+ "hardware_header",
+ "hardware_specs",
+ "hardware_definitions",
+ "hardware_best_practices",
+ "amd_optimization_guidance",
+]
+
@dataclass
class PromptConfig:
"""
@@ -88,41 +97,124 @@ def compose_blocks(self, keys: List[str]) -> str:
return "\n".join(text_parts).strip() + "\n"
-def _gpu_context_from_gpu_specs(py_path: str, gpu_name: str) -> Dict[str, str]:
+def _gpu_context_from_gpu_specs(py_path: str, gpu_name: str, vendor: str = "nvidia") -> Dict[str, str]:
"""
Load GPU_* dicts from the GPU specs file (no exec of raw strings; use runpy).
+
+ Supports both NVIDIA and AMD GPUs.
+
Expected globals:
- - GPU_SPEC_INFO: dict[str, dict]
- - GPU_DEFINITIONS: dict[str, str]
- - GPU_BEST_PRACTICES: list[str] OR {"list": [...]} for compatibility
+ For NVIDIA:
+ - GPU_SPEC_INFO: dict[str, dict]
+ - GPU_DEFINITIONS: dict[str, str]
+ - GPU_BEST_PRACTICES: list[str] OR {"list": [...]} for compatibility
+ For AMD:
+ - AMD_GPU_SPEC_INFO: dict[str, dict]
+ - AMD_GPU_DEFINITIONS: dict[str, str]
+ - AMD_GPU_BEST_PRACTICES: list[str]
+ Args:
+ py_path: Path to the gpu_specs.py file
+ gpu_name: GPU name to look up (e.g., "L40S", "MI355X", "R9700")
+ vendor: GPU vendor ("nvidia" or "amd")
+
+ Returns:
+ Dict with context variables for prompt rendering
"""
mod = runpy.run_path(py_path)
- spec_info = mod.get("GPU_SPEC_INFO", {})
- definitions = mod.get("GPU_DEFINITIONS", {})
- best = mod.get("GPU_BEST_PRACTICES", [])
+
+ is_amd = vendor.lower() == "amd"
+
+ if is_amd:
+ # Load AMD-specific specs
+ spec_info = mod.get("AMD_GPU_SPEC_INFO", {})
+ definitions = mod.get("AMD_GPU_DEFINITIONS", {})
+ best = mod.get("AMD_GPU_BEST_PRACTICES", [])
+
+        # AMD-specific prompts (in progress)
+
+
+ else:
+ # Load NVIDIA specs
+ spec_info = mod.get("GPU_SPEC_INFO", {})
+ definitions = mod.get("GPU_DEFINITIONS", {})
+ best = mod.get("GPU_BEST_PRACTICES", [])
if not spec_info or not definitions or best is None:
- raise ValueError("GPU_SPEC_INFO / GPU_DEFINITIONS / GPU_BEST_PRACTICES missing in gpu specs .py")
+ vendor_name = "AMD" if is_amd else "NVIDIA"
+ raise ValueError(f"{vendor_name} GPU_SPEC_INFO / GPU_DEFINITIONS / GPU_BEST_PRACTICES missing in gpu specs .py")
if isinstance(best, dict) and "list" in best:
best = best["list"]
if gpu_name not in spec_info:
- raise KeyError(f"GPU name {gpu_name} not found in GPU_SPEC_INFO")
+ # For AMD, try to find a matching key by partial match
+ if is_amd:
+ matched_key = None
+ for key in spec_info.keys():
+ if key.lower() in gpu_name.lower() or gpu_name.lower() in key.lower():
+ matched_key = key
+ break
+ if matched_key is None and spec_info:
+ matched_key = next(iter(spec_info)) # Use first entry as fallback
+ if matched_key:
+ gpu_name = matched_key
+ else:
+ raise KeyError(f"GPU name {gpu_name} not found in AMD_GPU_SPEC_INFO")
+ else:
+ raise KeyError(f"GPU name {gpu_name} not found in GPU_SPEC_INFO")
curr = spec_info[gpu_name]
gpu_architecture = curr.get("GPU Architecture", "Unknown")
- specs_bullets = "\n".join([f"- We have {v} of {k}." for k, v in curr.items() if k != "GPU Architecture"])
+
+ if is_amd:
+ specs_bullets = "\n".join([f"- {k}: {v}" for k, v in curr.items()])
+ vendor_display = "AMD"
+ else:
+ specs_bullets = "\n".join([f"- We have {v} of {k}." for k, v in curr.items() if k != "GPU Architecture"])
+ vendor_display = "NVIDIA"
+
defs_bullets = "\n".join([f"- {k}: {v}" for k, v in definitions.items()])
best_bullets = "\n".join([f"- {x}" for x in (best or [])])
- return {
+ context = {
"gpu_name": gpu_name,
"gpu_architecture": gpu_architecture,
"gpu_specs_bullets": specs_bullets,
"gpu_definitions_bullets": defs_bullets,
"gpu_best_practices_bullets": best_bullets,
+ "gpu_vendor": vendor.lower(),
+ "gpu_vendor_display": vendor_display,
}
+
+    # Add AMD-specific prompts to context (in progress)
+ # if is_amd:
+ # context["amd_high_correct_prompt"] = high_correct_prompt
+ # context["amd_rdna4_prompt"] = rdna4_prompt
+ # context["amd_quant_op_prompt"] = quant_op_prompt
+ # context["amd_helion_prompt"] = helion_prompt
+
+ # # Determine which AMD prompts to include based on GPU architecture
+ # gpu_name_lower = gpu_name.lower()
+ # gpu_arch_lower = gpu_architecture.lower()
+
+ # amd_extra_guidance = ""
+ # amd_extra_guidance += "\n\n### AMD GPU Optimization Guidance\n\n"
+ # amd_extra_guidance += high_correct_prompt
+
+ # # RDNA4 specific guidance
+ # if any(x in gpu_name_lower or x in gpu_arch_lower for x in ["gfx12", "rdna4", "rx 9", "rx9", "r9700"]):
+ # amd_extra_guidance += "\n\n" + rdna4_prompt
+
+ # # CDNA / MI series specific guidance
+ # if any(x in gpu_name_lower or x in gpu_arch_lower for x in ["mi300", "mi355", "mi3", "gfx9"]):
+ # amd_extra_guidance += "\n\n" + quant_op_prompt
+
+ # # Helion guidance for all AMD GPUs
+ # amd_extra_guidance += "\n\n" + helion_prompt
+
+ # context["amd_optimization_guidance"] = amd_extra_guidance
+
+ return context
def render_prompt_by_option(
*,
@@ -135,10 +227,13 @@ def render_prompt_by_option(
precision: Optional[str] = None,
include_hardware: bool = False,
components_override: Optional[List[str]] = None,
+ vendor: str = "nvidia",
) -> str:
"""
Render a prompt using backends.X and options.Y structure from TOML.
+ Supports both NVIDIA and AMD GPUs.
+
Args:
prompts_toml: Path to the prompts.toml file
backend: The kernel backend (triton, cuda, cute, tilelang)
@@ -154,12 +249,15 @@ def render_prompt_by_option(
components_override: When provided, users can arrange prompt components from the toml
file in any order they want.
Components must exist under templates.common or be hardware_* entries.
+ vendor: GPU vendor ("nvidia" or "amd") - affects hardware info and prompt content
Returns:
The rendered prompt string
"""
cfg = PromptConfig.from_toml(prompts_toml)
+ is_amd = vendor.lower() == "amd"
+
# Get backend-specific content
try:
backend_data = cfg.data["backends"][backend]
@@ -172,15 +270,19 @@ def render_prompt_by_option(
except KeyError:
raise KeyError(f"Unknown option: {option}")
+ # Determine which hardware component keys to use
+ hardware_keys = AMD_HARDWARE_COMPONENT_KEYS if is_amd else HARDWARE_COMPONENT_KEYS
+
component_sequence = list(components_override or option_data["components"])
if include_hardware:
if components_override is None:
insert_idx = component_sequence.index("arch_block") if "arch_block" in component_sequence else len(component_sequence)
- component_sequence[insert_idx:insert_idx] = HARDWARE_COMPONENT_KEYS
+ component_sequence[insert_idx:insert_idx] = hardware_keys
else:
# Custom sequences must explicitly have hardware blocks present in their prompt if they
# have set they are including hardware info.
- if not any(component in HARDWARE_COMPONENT_KEYS for component in component_sequence):
+ all_hardware_keys = set(HARDWARE_COMPONENT_KEYS) | set(AMD_HARDWARE_COMPONENT_KEYS)
+ if not any(component in all_hardware_keys for component in component_sequence):
raise ValueError(
"components_override must contain at least one hardware_* entry when include_hardware=True"
)
@@ -288,7 +390,7 @@ def render_example_entry(input_code: str, output_code: str, example_label: str)
raise ValueError(
f"Hardware info requested for option '{option}'; provide gpu_specs_py and gpu_name"
)
- context = {**context, **_gpu_context_from_gpu_specs(resolve_path(gpu_specs_py), gpu_name)}
+ context = {**context, **_gpu_context_from_gpu_specs(resolve_path(gpu_specs_py), gpu_name, vendor=vendor)}
# Builds the prompt from the components in the toml file.
prompt_parts = []
@@ -326,17 +428,21 @@ def get_prompt_for_backend(
precision: Optional[str] = None,
include_hardware: bool = False,
gpu_name: Optional[str] = None,
+ vendor: str = "nvidia",
) -> str:
"""
Generate a prompt for a specific backend and option.
+ Supports both NVIDIA and AMD GPUs.
+
Args:
ref_arch_src: The reference architecture source code
backend: The kernel backend (triton, cuda, cute, tilelang)
option: The prompt option (zero_shot, one_shot, few_shot)
precision: Optional precision (fp32, fp16, bf16) - defaults to fp32 if not provided
include_hardware: When True, append hardware guidance blocks (requires gpu_name)
- gpu_name: GPU identifier used when include_hardware is True (e.g., "A100")
+ gpu_name: GPU identifier used when include_hardware is True (e.g., "A100", "MI355X")
+ vendor: GPU vendor ("nvidia" or "amd")
"""
return render_prompt_by_option(
prompts_toml=PROMPTS_TOML,
@@ -347,6 +453,7 @@ def get_prompt_for_backend(
include_hardware=include_hardware,
gpu_specs_py=GPU_SPECS_PY if include_hardware else None,
gpu_name=gpu_name,
+ vendor=vendor,
)
@@ -360,11 +467,25 @@ def get_custom_prompt(
include_hardware: bool = False,
gpu_name: Optional[str] = None,
prompts_toml: str = PROMPTS_TOML,
+ vendor: str = "nvidia",
) -> str:
"""
Render a prompt defined under [custom_prompts.] in prompts.toml.
Must still provide backend/option/precision settings just like
- get_prompt_for_backend.
+ get_prompt_for_backend.
+
+ Supports both NVIDIA and AMD GPUs.
+
+ Args:
+ custom_key: The custom prompt key in prompts.toml
+ ref_arch_src: The reference architecture source code
+ backend: The kernel backend (triton, cuda, cute, tilelang)
+ option: The prompt option (zero_shot, one_shot, few_shot)
+ precision: Optional precision (fp32, fp16, bf16)
+ include_hardware: When True, include hardware guidance
+ gpu_name: GPU identifier (e.g., "A100", "MI355X")
+ prompts_toml: Path to prompts.toml file
+ vendor: GPU vendor ("nvidia" or "amd")
"""
if not ref_arch_src:
raise ValueError(f"Custom prompt '{custom_key}' requires ref_arch_src.")
@@ -386,6 +507,7 @@ def get_custom_prompt(
gpu_specs_py=GPU_SPECS_PY if include_hardware else None,
gpu_name=gpu_name,
components_override=components_override,
+ vendor=vendor,
)
__all__ = [
@@ -404,7 +526,7 @@ def log_prompt(prompt: str, dir_path: str, file_name: str):
def test_prompt():
"""
- Demonstrate baseline, few-shot, DSL, hardware-aware, and custom prompt
+ Demonstrate baseline, few-shot, DSL, hardware-aware, AMD, and custom prompt
generation. Customize the reference architecture or custom_prompt_key
if you want to try different inputs.
"""
@@ -413,6 +535,7 @@ def test_prompt():
print("Testing prompt construction...")
scratch_dir = os.path.join(REPO_TOP_PATH, "scratch")
+
# baseline prompt
baseline_prompt = get_prompt_for_backend(
ref_arch_src=ref_arch_src,
@@ -441,7 +564,7 @@ def test_prompt():
)
log_prompt(dsl_prompt, os.path.join(scratch_dir), "dsl_prompt.txt")
- # hardware prompt
+ # NVIDIA hardware prompt
hardware_prompt = get_prompt_for_backend(
ref_arch_src=ref_arch_src,
backend="cute",
@@ -449,9 +572,34 @@ def test_prompt():
precision="fp32",
include_hardware=True,
gpu_name="L40S",
+ vendor="nvidia",
)
log_prompt(hardware_prompt, os.path.join(scratch_dir), "hardware_prompt.txt")
+ # AMD hardware prompt (MI355X - CDNA)
+ amd_cdna_prompt = get_prompt_for_backend(
+ ref_arch_src=ref_arch_src,
+ backend="triton",
+ option="one_shot",
+ precision="fp32",
+ include_hardware=True,
+ gpu_name="MI355X",
+ vendor="amd",
+ )
+ log_prompt(amd_cdna_prompt, os.path.join(scratch_dir), "amd_cdna_prompt.txt")
+
+ # AMD hardware prompt (RDNA4 - R9700)
+ amd_rdna4_prompt = get_prompt_for_backend(
+ ref_arch_src=ref_arch_src,
+ backend="triton",
+ option="one_shot",
+ precision="fp32",
+ include_hardware=True,
+ gpu_name="R9700",
+ vendor="amd",
+ )
+ log_prompt(amd_rdna4_prompt, os.path.join(scratch_dir), "amd_rdna4_prompt.txt")
+
# custom prompt defined in prompts.toml
custom_prompt = get_custom_prompt(
# the key is whatever you name the prompt in the custom_prompts section of the toml file
@@ -463,8 +611,11 @@ def test_prompt():
precision="fp32",
include_hardware=True,
gpu_name="L40S",
+ vendor="nvidia",
)
log_prompt(custom_prompt, os.path.join(scratch_dir), "custom_prompt.txt")
+ print("All prompts generated successfully!")
+
if __name__ == "__main__":
test_prompt()
\ No newline at end of file
diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py
index 800f20ef..6e28cc82 100644
--- a/src/kernelbench/prompts/hardware/gpu_specs.py
+++ b/src/kernelbench/prompts/hardware/gpu_specs.py
@@ -1,9 +1,14 @@
"""
A List of GPU Specs to include in the prompt
+Supports both NVIDIA and AMD GPUs.
"""
+# =============================================================================
+# NVIDIA GPU Specifications
+# =============================================================================
+
GPU_SPEC_INFO = {
"L40S": {
"GPU Architecture": "Ada",
@@ -121,20 +126,173 @@
}
}
-# Basic GPU concept definitions
+# =============================================================================
+# AMD GPU Specifications
+# =============================================================================
+
+AMD_GPU_SPEC_INFO = {
+ # Based on provided rocminfo for AMD Radeon 9700 (gfx1201)
+ "R9700": {
+ "GPU Name": "AMD Radeon 9700 (gfx1201)",
+ "GPU Architecture": "AMD RDNA4 (gfx1201)",
+ "Compute Units": 64,
+ "SIMDs per CU": 2,
+ "Shader Engines": 4,
+ "Shader Arrays per Engine": 2,
+ "Wavefront Size": "Wave32",
+ "Max Clock (MHz)": 2350,
+ "Workgroup Max Size": 1024,
+ "Max Waves per CU": 32,
+ "Stream Processors": 4096,
+ "Ray Accelerators": 64,
+ "AI Accelerators": 128,
+ "ROPs": 128,
+ "Transistors": "53.9 Billion",
+ "Peak Pixel Fill Rate": "373.76 GP/s",
+ "L1 Cache": "32 KB",
+ "L2 Cache": "8 MB",
+ "L3 Cache": "64 MB",
+ "Cacheline Size": "256 B",
+ "LDS (Workgroup Local Memory)": "64 KB",
+ "VRAM": "32,061,259,776 B (~29.85 GiB)",
+ "Memory Bandwidth": "Unknown",
+ "FP32 Vector TFLOPS": "47.8",
+ "FP16 Vector TFLOPS": "95.7",
+ "FP16 Matrix TFLOPS": "191 (383 w/ sparsity)",
+ "FP8 Matrix TFLOPS": "383 (766 w/ sparsity)",
+ "INT8 Matrix TOPS": "383 (766 w/ sparsity)",
+ "INT4 Matrix TOPS": "766 (1531 w/ sparsity)",
+ "Max Registers per Block": 196608,
+ "Max Shared Memory per Block": 65536,
+ "Max Threads per Block": 1024,
+ "Max Threads per CU": 2048,
+ "Shared Memory per CU": 2097152,
+ "Warp Size": 32,
+ "MFMA": "Unknown",
+ },
+ # Based on provided rocminfo for AMD Instinct MI355X (gfx950)
+ "MI355X": {
+ "GPU Name": "AMD Instinct MI355X (gfx950)",
+ "GPU Architecture": "gfx950 (CDNA family)",
+ "Compute Units": 256,
+ "SIMDs per CU": 4,
+ "Shader Engines": 32,
+ "Shader Arrays per Engine": 1,
+ "Wavefront Size": "Wave64",
+ "Max Clock (MHz)": 2400,
+ "Peak Engine Clock": "2.4 GHz",
+ "Workgroup Max Size": 1024,
+ "Max Waves per CU": 32,
+ "Max Work-item per CU": 2048,
+ "Matrix Cores": 1024,
+ "Stream Processors": 16384,
+ "L1 Cache": "32 KB",
+ "L2 Cache": "4 MB",
+ "L3 Cache": "256 MB",
+ "Cacheline Size": "128 B",
+ "LDS (Workgroup Local Memory)": "160 KB",
+ "VRAM": "288 GB HBM3E (309,220,868,096 B)",
+ "Memory Bandwidth": "8 TB/s",
+ "Memory Interface": "8192 bits",
+ "Infinity Cache (Last Level)": "256 MB",
+ "FP16 Vector TFLOPS": "157.3",
+ "FP16 Matrix PFLOPS": "2.5166 (5.0332 w/ sparsity)",
+ "BF16 Matrix PFLOPS": "2.5166 (5.0332 w/ sparsity)",
+ "INT8 Matrix POPS": "5.0332 (10.0664 w/ sparsity)",
+ "MXFP8 PFLOPS": "5.0332",
+ "OCP-FP8 PFLOPS": "5.0332 (10.0664 w/ sparsity)",
+ "MXFP6 PFLOPS": "10.0663",
+ "MXFP4 PFLOPS": "10.0663",
+ "FP64 Vector TFLOPS": "78.6",
+ "FP32 Vector TFLOPS": "157.3",
+ "FP64 Matrix TFLOPS": "78.6",
+ "FP32 Matrix TFLOPS": "157.3",
+ "Max Registers per Block": 131072,
+ "Max Shared Memory per Block": 163840,
+ "Max Threads per Block": 1024,
+ "Max Threads per CU": 2048,
+ "Shared Memory per CU": 41943040,
+ "Warp Size": 64,
+ "MFMA": "Unknown",
+ },
+ # Based on provided rocminfo + HIP query for AMD Radeon PRO W7900D (gfx1100)
+ "W7900D": {
+ "GPU Name": "AMD Radeon PRO W7900D (gfx1100)",
+ "GPU Architecture": "AMD RDNA3 (gfx1100)",
+ "Compute Units": 96,
+ "SIMDs per CU": 2,
+ "Shader Engines": 6,
+ "Shader Arrays per Engine": 2,
+ "Wavefront Size": "Wave32",
+ "Max Clock (MHz)": 1760,
+ "Workgroup Max Size": 1024,
+ "Max Waves per CU": 32,
+ "Max Work-item per CU": 1024,
+ "L1 Cache": "32 KB",
+ "L2 Cache": "6 MB",
+ "L3 Cache": "96 MB",
+ "Cacheline Size": "128 B",
+ "LDS (Workgroup Local Memory)": "64 KB",
+ "VRAM": "Unknown",
+ "Memory Bandwidth": "Unknown",
+ "Max Registers per Block": 196608,
+ "Max Shared Memory per Block": 65536,
+ "Max Threads per Block": 1024,
+ "Max Threads per CU": 2048,
+ "Shared Memory per CU": 3145728,
+ "Warp Size": 32,
+ "MFMA": "Unknown",
+ },
+}
+
+# =============================================================================
+# GPU Concept Definitions
+# =============================================================================
+
+# Basic GPU concept definitions (NVIDIA-centric)
GPU_DEFINITIONS = {
"Thread": "A thread is a single execution unit that can run a single instruction at a time.",
"Thread Block": "A thread block is a group of threads that can cooperate with each other.",
"Warp": "A warp is a group of threads that are scheduled together and execute in parallel.",
+ "SM": "A Streaming Multiprocessor, the core execution unit on NVIDIA GPUs.",
+ "Tensor Core": "Specialized units for mixed-precision matrix operations.",
+ "Occupancy": "The ratio of active warps to the maximum supported on an SM.",
"Shared Memory": "Shared memory is a memory space that can be accessed by all threads in a thread block.",
+ "Shared Memory Bank": "A subdivision of shared memory that can cause bank conflicts.",
"Register": "A register is a small memory space that can be accessed by a single thread.",
+ "Global Memory": "Off-chip DRAM accessible by all threads on the GPU.",
+ "Constant Memory": "Read-only cached memory optimized for uniform access.",
+ "Coalesced Access": "Memory access pattern that combines multiple requests into fewer transactions.",
+ "Divergence": "When threads in the same warp take different control paths.",
"Memory Hierarchy": "Memory hierarchy is a pyramid of memory types with different speeds and sizes.",
"Memory Bandwidth": "Memory bandwidth is the rate at which data can be read from or stored into memory.",
"Cache": "Cache is a small memory space that stores frequently accessed data.",
"HBM": "HBM is a high-bandwidth memory technology that uses 3D-stacked DRAM.",
}
+# AMD GPU concept definitions
+AMD_GPU_DEFINITIONS = {
+ "Wavefront": "AMD's SIMD execution group (Wave32 or Wave64).",
+ "Wave32": "A 32-lane wavefront, common on RDNA architectures.",
+ "Wave64": "A 64-lane wavefront, common on CDNA architectures.",
+ "Compute Unit (CU)": "AMD's equivalent of an NVIDIA SM.",
+ "Work-item": "A single thread in a kernel execution.",
+ "Workgroup": "A group of work-items that can synchronize and share LDS.",
+ "SIMD": "A SIMD unit inside a CU that executes a wavefront.",
+ "LDS": "Local Data Share, AMD's shared memory.",
+ "VGPR": "Vector registers allocated per work-item.",
+ "SGPR": "Scalar registers shared across a wavefront.",
+ "Occupancy": "Number of active waves per CU, limited by registers and LDS.",
+ "Infinity Cache": "AMD's last-level cache that reduces DRAM traffic.",
+ "MFMA": "Matrix Fused Multiply-Add instruction for matrix cores.",
+ "Barrier": "A workgroup synchronization point.",
+}
+
+
+# =============================================================================
+# Best Practices
+# =============================================================================
GPU_BEST_PRACTICES = [
# From https://docs.nvidia.com/cuda/ada-tuning-guide/index.html
@@ -145,6 +303,80 @@
"Ensure that global memory accesses are coalesced.",
"Minimize redundant accesses to global memory whenever possible.",
"Avoid long sequences of diverged execution by threads within the same warp.",
+ "Use shared memory to cache data that is reused within a block.",
+ "Avoid shared memory bank conflicts; pad arrays when needed.",
+ "Balance occupancy against register and shared memory usage.",
+ "Use vectorized loads/stores when they improve bandwidth.",
+ "Prefer tensor cores for matrix operations when supported.",
+ "Use streams to overlap compute and data transfers.",
+ "Use asynchronous copy features (e.g., cp.async) when available.",
# we added this to reference the specific GPU architecture
"Use specialized instructions based on the specific GPU architecture",
-]
\ No newline at end of file
+]
+
+AMD_GPU_BEST_PRACTICES = [
+ "Prefer Wave32-friendly configurations on RDNA architectures.",
+ "Prefer Wave64 on CDNA unless the kernel benefits from Wave32.",
+ "Choose workgroup sizes as multiples of the wavefront (32 or 64).",
+ "Start with workgroup sizes in [256, 512, 1024] for 1D kernels.",
+ "Balance VGPR usage and occupancy; avoid register spilling.",
+ "Use LDS for data reuse; pad to avoid LDS bank conflicts.",
+ "Keep global memory access contiguous and aligned (128B where possible).",
+ "Use vectorized loads/stores when it improves bandwidth utilization.",
+ "Use MFMA/matrix cores for GEMM-like operations when available.",
+ "Minimize divergent branches within a wavefront.",
+ "Avoid fp16 for exp/log; cast to fp32 for numerically sensitive ops.",
+]
+
+# =============================================================================
+# AMD-Specific Prompt Templates (in progress)
+# =============================================================================
+
+
+
+# =============================================================================
+# Helper Functions for GPU Detection
+# =============================================================================
+
+def get_gpu_vendor() -> str:
+ """
+ Detect the GPU vendor (nvidia or amd).
+ Returns: 'nvidia', 'amd', or 'unknown'
+ """
+ try:
+ import torch
+ if not torch.cuda.is_available():
+ return "unknown"
+ # Check for HIP version (ROCm indicator)
+ if hasattr(torch.version, 'hip') and torch.version.hip is not None:
+ return "amd"
+ return "nvidia"
+ except ImportError:
+ return "unknown"
+
+
+def get_gpu_specs_for_vendor(vendor: str) -> dict:
+ """
+ Get appropriate GPU specs dictionary based on vendor.
+ """
+ if vendor.lower() == "amd":
+ return AMD_GPU_SPEC_INFO
+ return GPU_SPEC_INFO
+
+
+def get_gpu_definitions_for_vendor(vendor: str) -> dict:
+ """
+ Get appropriate GPU definitions dictionary based on vendor.
+ """
+ if vendor.lower() == "amd":
+ return AMD_GPU_DEFINITIONS
+ return GPU_DEFINITIONS
+
+
+def get_gpu_best_practices_for_vendor(vendor: str) -> list:
+ """
+ Get appropriate best practices list based on vendor.
+ """
+ if vendor.lower() == "amd":
+ return AMD_GPU_BEST_PRACTICES
+ return GPU_BEST_PRACTICES
\ No newline at end of file
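+
+# Illustrative usage of the vendor helpers above (comment-only sketch):
+#
+#   vendor = get_gpu_vendor()                       # "nvidia", "amd", or "unknown"
+#   specs = get_gpu_specs_for_vendor(vendor)        # GPU_SPEC_INFO or AMD_GPU_SPEC_INFO
+#   practices = get_gpu_best_practices_for_vendor(vendor)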
diff --git a/src/kernelbench/prompts/prompts.toml b/src/kernelbench/prompts/prompts.toml
index 2768aa11..4e264d80 100644
--- a/src/kernelbench/prompts/prompts.toml
+++ b/src/kernelbench/prompts/prompts.toml
@@ -142,6 +142,7 @@ other placeholder supported in the shared context.
# -------------------------------------------------------------------------
# Hardware Templates: GPU-specific information blocks
+# Supports both NVIDIA and AMD GPUs via {gpu_vendor_display} placeholder
# -------------------------------------------------------------------------
[templates.hardware]
hardware_header = """
@@ -149,7 +150,7 @@ Here is some information about the underlying hardware that you should keep in m
"""
hardware_specs = """
-The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.
+The GPU that will run the kernel is {gpu_vendor_display} {gpu_name}, {gpu_architecture} architecture.
{gpu_specs_bullets}
"""
@@ -166,6 +167,11 @@ Here are some best practices for writing kernels on GPU:
{gpu_best_practices_bullets}
"""
+# AMD-specific optimization guidance (only included for AMD GPUs)
+amd_optimization_guidance = """
+{amd_optimization_guidance}
+"""
+
# -------------------------------------------------------------------------
# Options: Different prompt construction modes
# -------------------------------------------------------------------------
diff --git a/src/kernelbench/utils.py b/src/kernelbench/utils.py
index cf8b0ad8..c7efbaec 100644
--- a/src/kernelbench/utils.py
+++ b/src/kernelbench/utils.py
@@ -18,8 +18,10 @@
from importlib.resources import files, as_file
# API clients
+from together import Together
from openai import OpenAI
-from litellm import completion
+import google.generativeai as genai
+import anthropic
import numpy as np
from contextlib import contextmanager
@@ -27,17 +29,41 @@
import time
import concurrent
from functools import cache
-
+from transformers import AutoTokenizer
from concurrent.futures import ProcessPoolExecutor, as_completed
-SGLANG_KEY = os.environ.get("SGLANG_API_KEY")
+# Define API key access
+TOGETHER_KEY = os.environ.get("TOGETHER_API_KEY")
+DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY")
+OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
+GEMINI_KEY = os.environ.get("GEMINI_API_KEY")
+SGLANG_KEY = os.environ.get("SGLANG_API_KEY") # for Local Deployment
+ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY")
+SAMBANOVA_API_KEY = os.environ.get("SAMBANOVA_API_KEY")
+FIREWORKS_API_KEY = os.environ.get("FIREWORKS_API_KEY")
########################################################
# Inference Helpers
########################################################
+@cache
+def load_deepseek_tokenizer():
+ return AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V2", trust_remote_code=True)
+
+# Buffer because deepseek totally blocks us if we send stuff that's too long :(
+TOO_LONG_FOR_DEEPSEEK = 115_000
+
+def is_safe_to_send_to_deepseek(prompt):
+ tokenizer = load_deepseek_tokenizer()
+ if type(prompt) == str:
+ return (
+ len(tokenizer(prompt, verbose=False)["input_ids"]) < TOO_LONG_FOR_DEEPSEEK
+ )
+ else:
+ return len(tokenizer.apply_chat_template(prompt)) < TOO_LONG_FOR_DEEPSEEK
+
def set_gpu_arch(arch_list: list[str]):
"""
Set env variable for torch cuda arch list to build kernels for specified architectures
@@ -69,18 +95,231 @@ def query_server(
):
"""
    Query various sorts of LLM inference API providers
- Done through liteLLM:
- - Local Server (SGLang, vLLM, Tokasaurus)
+ Supports:
+ - OpenAI (AMD LLM Gateway)
+ - Deepseek
+ - Together
+ - Sambanova
+ - Anthropic
+ - Gemini / Google AI Studio
+    - Fireworks (OpenAI compatibility)
+ - SGLang (Local Server)
"""
- # Local Server (SGLang, vLLM, Tokasaurus) - special handling
- if server_type == "local":
- url = f"http://{server_address}:{server_port}"
- client = OpenAI(
- api_key=SGLANG_KEY, base_url=f"{url}/v1", timeout=None, max_retries=0
+ # Select model and client based on arguments
+ match server_type:
+ case "sglang":
+ url = f"http://{server_address}:{server_port}"
+ client = OpenAI(
+ api_key=SGLANG_KEY, base_url=f"{url}/v1", timeout=None, max_retries=0
+ )
+ model = "default"
+ case "deepseek":
+ client = OpenAI(
+ api_key=DEEPSEEK_KEY,
+ base_url="https://api.deepseek.com",
+ timeout=10000000,
+ max_retries=3,
+ )
+ model = model_name
+            assert model in ["deepseek-chat", "deepseek-coder", "deepseek-reasoner"], "Only support deepseek-chat, deepseek-coder, or deepseek-reasoner for now"
+ if not is_safe_to_send_to_deepseek(prompt):
+ raise RuntimeError("Prompt is too long for DeepSeek")
+ case "fireworks":
+ client = OpenAI(
+ api_key=FIREWORKS_API_KEY,
+ base_url="https://api.fireworks.ai/inference/v1",
+ timeout=10000000,
+ max_retries=3,
+ )
+ model = model_name
+
+ case "anthropic":
+ client = anthropic.Anthropic(
+ api_key=ANTHROPIC_KEY,
+ )
+ model = model_name
+ case "google":
+ genai.configure(api_key=GEMINI_KEY)
+ model = model_name
+ case "together":
+ client = Together(api_key=TOGETHER_KEY)
+ model = model_name
+ case "sambanova":
+ client = OpenAI(api_key=SAMBANOVA_API_KEY, base_url="https://api.sambanova.ai/v1")
+ model = model_name
+
+ case "openai":
+ # AMD LLM Gateway
+ client = OpenAI(
+ base_url="https://llm-api.amd.com/OpenAI",
+ api_key="dummy",
+ default_headers={
+ "Ocp-Apim-Subscription-Key": os.environ.get("LLM_GATEWAY_KEY"),
+ }
+ )
+ model = model_name
+ case _:
+ raise NotImplementedError(f"Server type {server_type} not supported")
+
+ if server_type != "google":
+ assert client is not None, "Client is not set, cannot proceed to generations"
+ else:
+ print(
+ f"Querying {server_type} {model} with temp {temperature} max tokens {max_tokens}"
)
- if isinstance(prompt, str):
+ # Logic to query the LLM
+ if server_type == "anthropic":
+        assert isinstance(prompt, str)
+
+ if is_reasoning_model:
+ # Use beta endpoint with thinking enabled for reasoning models
+ response = client.beta.messages.create(
+ model=model,
+ system=system_prompt,
+ messages=[
+ {"role": "user", "content": prompt},
+ ],
+ max_tokens=max_tokens,
+ # Claude thinking requires budget_tokens for thinking (reasoning)
+ thinking={"type": "enabled", "budget_tokens": budget_tokens},
+ betas=["output-128k-2025-02-19"],
+ )
+ else:
+ # Use standard endpoint for normal models
+ response = client.messages.create(
+ model=model,
+ system=system_prompt,
+ messages=[
+ {"role": "user", "content": prompt},
+ ],
+ temperature=temperature,
+ top_p=top_p,
+ top_k=top_k,
+ max_tokens=max_tokens,
+ )
+ outputs = [choice.text for choice in response.content if not hasattr(choice, 'thinking') or not choice.thinking]
+
+ elif server_type == "google":
+ generation_config = {
+ "temperature": temperature,
+ "top_p": top_p,
+ "top_k": top_k,
+ "max_output_tokens": max_tokens,
+ "response_mime_type": "text/plain",
+ }
+
+ model = genai.GenerativeModel(
+ model_name=model_name,
+ system_instruction=system_prompt,
+ generation_config=generation_config,
+ )
+
+ response = model.generate_content(prompt)
+
+ return response.text
+
+ elif server_type == "deepseek":
+
+ if model in ["deepseek-chat", "deepseek-coder"]:
+ # regular deepseek model
+ response = client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ stream=False,
+ temperature=temperature,
+ n=num_completions,
+ max_tokens=max_tokens,
+ top_p=top_p,
+ )
+
+ else: # deepseek reasoner
+ assert is_reasoning_model, "Only support deepseek-reasoner for now"
+ assert model == "deepseek-reasoner", "Only support deepseek-reasoner for now"
+ response = client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ stream=False,
+ n=num_completions,
+ max_tokens=max_tokens,
+ # do not use temperature or top_p
+ )
+ outputs = [choice.message.content for choice in response.choices]
+ elif server_type == "openai":
+ if is_reasoning_model:
+ assert "o1" in model or "o3" in model, "Only support o1 and o3 for now"
+ print(f"Using OpenAI reasoning model: {model} with reasoning effort {reasoning_effort}")
+ response = client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "user", "content": prompt},
+ ],
+ reasoning_effort=reasoning_effort,
+ )
+ else:
+ response = client.chat.completions.create(
+ model=model,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ stream=False,
+ temperature=temperature,
+ n=num_completions,
+ max_tokens=max_tokens,
+ top_p=top_p,
+ )
+ outputs = [choice.message.content for choice in response.choices]
+ elif server_type == "together":
+ response = client.chat.completions.create(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ top_p=top_p,
+ top_k=top_k,
+ stop=["<|eot_id|>", "<|eom_id|>"],
+ stream=False,
+ )
+ outputs = [choice.message.content for choice in response.choices]
+ elif server_type == "fireworks":
+ response = client.chat.completions.create(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ stop=["<|eot_id|>", "<|eom_id|>"],
+ stream=False,
+ )
+ outputs = [choice.message.content for choice in response.choices]
+ elif server_type == "sambanova":
+ response = client.chat.completions.create(
+ model=model,
+ max_tokens=max_tokens,
+ temperature=temperature,
+ messages=[
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ top_p=top_p,
+ )
+ outputs = [choice.message.content for choice in response.choices]
+ # for all other kinds of servers, use standard API
+ else:
+        if isinstance(prompt, str):
response = client.completions.create(
- model="default",
+ model=model,
prompt=prompt,
temperature=temperature,
n=num_completions,
@@ -90,7 +329,7 @@ def query_server(
outputs = [choice.text for choice in response.choices]
else:
response = client.chat.completions.create(
- model="default",
+ model=model,
messages=prompt,
temperature=temperature,
n=num_completions,
@@ -98,105 +337,42 @@ def query_server(
top_p=top_p,
)
outputs = [choice.message.content for choice in response.choices]
-
- # output processing
- if len(outputs) == 1:
- return outputs[0]
- else:
- return outputs
-
- # All other providers - use LiteLLM unified interface
- # Build messages list with system prompt first (if not already present)
- messages = []
-
- # Check if prompt is already a list with a system message
- if isinstance(prompt, list) and prompt and prompt[0].get("role") == "system":
- # Prompt already has system message, use it directly
- messages = prompt
+
+ # output processing
+ if len(outputs) == 1:
+ return outputs[0]
else:
- # Add system prompt first if provided
- if system_prompt:
- messages.append({"role": "system", "content": system_prompt})
-
- # Then add the actual prompt
- if isinstance(prompt, str):
- messages.append({"role": "user", "content": prompt})
- else:
- messages.extend(prompt)
-
- try:
- completion_kwargs = {
- "model": model_name,
- "messages": messages,
- "max_tokens": max_tokens,
- "n": num_completions,
- }
-
- # Reasoning models (o1, o3, etc.) don't support standard sampling params
- if is_reasoning_model:
- # Note: o1/o3 models don't support temperature, top_p, top_k
- # LiteLLM will pass through reasoning_effort for OpenAI o1/o3 models
- if reasoning_effort:
- completion_kwargs["reasoning_effort"] = reasoning_effort
- # Claude extended thinking uses "thinking" parameter with dict structure
- # Format: {"type": "enabled", "budget_tokens": }
- if budget_tokens > 0 and "anthropic" in model_name.lower():
- completion_kwargs["thinking"] = {"type": "enabled", "budget_tokens": budget_tokens}
- else:
- # Standard models support temperature and top_p
- completion_kwargs["temperature"] = temperature
- completion_kwargs["top_p"] = top_p
-
- # top_k is not supported by OpenAI models
- if "openai/" not in model_name.lower() and "gpt" not in model_name.lower():
- completion_kwargs["top_k"] = top_k
-
- response = completion(**completion_kwargs)
-
- # output processing
- if num_completions == 1:
- content = response.choices[0].message.content
- if content is None:
- raise ValueError(f"LLM returned None content for model {model_name}. finish_reason: {response.choices[0].finish_reason}")
- return content
- else:
- contents = [choice.message.content for choice in response.choices]
- if any(c is None for c in contents):
- raise ValueError(f"LLM returned None content in one or more completions for model {model_name}")
- return contents
- except Exception as e:
- print(f"Error in query_server for model {model_name}: {e}")
- raise
+ return outputs
# a list of presets for API server configs
SERVER_PRESETS = {
"deepseek": {
"temperature": 1.6,
- "model_name": "deepseek/deepseek-coder",
+ "model_name": "deepseek-chat",
"max_tokens": 4096
},
"google": {
- "model_name": "gemini/gemini-2.5-flash",
+ "model_name": "gemini-1.5-flash-002",
"temperature": 0.7, # need to experiment with temperature
- "max_tokens": 16384,
+ "max_tokens": 8192,
},
"together": { # mostly for Llama 3.1
- "model_name": "together_ai/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+ "model_name": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
# "model_name": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
"temperature": 0.7,
"max_tokens": 4096,
},
- "local": { # this is for running locally (SGLang, vLLM, Tokasaurus), mostly for Llama
+ "sglang": { # this is for running locally, mostly for Llama
"temperature": 0.8, # human eval pass@N temperature
"server_port": 10210,
"server_address": "matx2.stanford.edu",
"max_tokens": 8192,
},
- "anthropic": { # for Claude 3.7 Sonnet
- "model_name": "anthropic/claude-3-7-sonnet-20250219",
+ "anthropic": { # for Claude 3.5 Sonnet
+ "model_name": "claude-3-5-sonnet-20241022",
"temperature": 0.8,
- "max_tokens": 8192,
+ "max_tokens": 4096,
},
"openai": {
"model_name": "gpt-4o-2024-08-06",
@@ -204,10 +380,10 @@ def query_server(
"temperature": 0.0,
"max_tokens": 4096,
},
- "fireworks": {
- "model_name": "fireworks_ai/llama-v3p1-70b-instruct",
- "temperature": 0.7,
- "max_tokens": 4096,
+ "sambanova": {
+ "model_name": "Meta-Llama-3.1-405B-Instruct",
+ "temperature": 0.1,
+ "max_tokens": 8192,
},
}
@@ -216,7 +392,6 @@ def create_inference_server_from_presets(server_type: str = None,
greedy_sample: bool = False,
verbose: bool = False,
time_generation: bool = False,
- model_name: str = None,
**kwargs,
) -> callable:
"""
@@ -224,21 +399,15 @@ def create_inference_server_from_presets(server_type: str = None,
"""
def _query_llm(prompt: str | list[dict]):
server_args = SERVER_PRESETS[server_type].copy()
-
- if model_name is not None and model_name != "None":
- server_args["model_name"] = model_name
-
+
if kwargs:
- filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None and v != "None"}
- server_args.update(filtered_kwargs)
-
+ server_args.update(kwargs)
if greedy_sample:
server_args["temperature"] = 0.0
server_args["top_p"] = 1.0
server_args["top_k"] = 1
-
if verbose:
- print(f"Querying server {server_type} with model {server_args['model_name']} and args: {server_args}")
+ print(f"Querying server {server_type} with args: {server_args}")
if time_generation:
start_time = time.time()
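For reviewers, here is a hedged usage sketch of the preset helper as it looks after this patch; the call site is hypothetical and assumes the matching API key (here `DEEPSEEK_API_KEY`) is already exported:

```python
from kernelbench.utils import create_inference_server_from_presets

# Build a query callable from the "deepseek" preset; extra kwargs override preset values.
query_llm = create_inference_server_from_presets(
    server_type="deepseek",
    greedy_sample=True,   # forces temperature=0.0, top_p=1.0, top_k=1 (see the diff above)
    verbose=True,
    max_tokens=4096,
)

# The returned callable accepts a plain string prompt or a chat-style message list.
answer = query_llm("Write a vector-add kernel in Triton.")
print(answer)
```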
From 28f6f4069aac73c70baee47c715e0437f4c0a018 Mon Sep 17 00:00:00 2001
From: 01xjw <220233704@seu.edu.cn>
Date: Thu, 29 Jan 2026 07:16:30 +0800
Subject: [PATCH 2/3] Revert README changes
---
README.md | 190 ++++++++++++++++++------------------------------------
1 file changed, 62 insertions(+), 128 deletions(-)
diff --git a/README.md b/README.md
index 1a01270b..61f7e8d8 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,19 @@
# KernelBench: Can LLMs Write Efficient GPU Kernels? [ICML '25]
-[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench) |
+A benchmark for evaluating LLMs' ability to generate efficient GPU kernels
+
+[arXiv](https://arxiv.org/html/2502.10517v1) | [blog post](https://scalingintelligence.stanford.edu/blogs/kernelbench/) | [HuggingFace Dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench)
+
+
## Versions
-The huggingface dataset is updated to v0.1.
-- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - Latest version (also main branch)
+The latest stable version will be on `main` branch. We continue to update and improve the repo.
+- [v0.1](https://github.com/ScalingIntelligence/KernelBench/tree/v0.1) - See [blog](https://scalingintelligence.stanford.edu/blogs/kernelbenchv01/)
- [v0](https://github.com/ScalingIntelligence/KernelBench/tree/v0) - Original Release
-A benchmark for evaluating LLMs' ability to generate efficient GPU kernels
-
+The Huggingface [dataset](https://huggingface.co/datasets/ScalingIntelligence/KernelBench) is updated to v0.1.
-
+This repo provides core functionality for KernelBench and an easy-to-use set of scripts for evaluation. It is not intended to provide complex agentic scaffolds that solve this task; we recommend cloning and modifying this repo for your experiment, or using it as a git submodule.
## 👋 Task Description
We structure the problem for LLM to transpile operators described in PyTorch to CUDA kernels, at whatever level of granularity it desires to.
@@ -26,7 +29,7 @@ We construct KernelBench to have 4 Levels of categories:
- **Level 4 🤗**: Level Hugging Face
Optimize whole model architectures from HuggingFace
-We are actively extending KernelBench to other DSLs beyond `cuda` as well.
+We are actively extending KernelBench to other DSLs beyond `cuda` as well (see below).
## ⚖️ Evaluation
#### Methodology
@@ -34,9 +37,9 @@ To evaluate model-generated kernels, we need to check if they:
- **is correct ✅**: check against reference torch operators `n_correctness` times on randomized inputs.
- **is performant ⏱️**: compare against reference torch operators `n_trial` times to measure speedup between runtimes.
-Check out `src/eval.py` for details on how we implement correctness check and timing.
+Check out `src/eval.py` for details on how we implement correctness check and timing and `EVAL.md` for notes on evaluation and benchmarking guidelines [WIP].
-We provide a convenient script `scripts/run_and_check.py` to evaluate one single sample source code against a reference source code, check correctness and compute speedup. You can use this to evaluate a model-generated kernel.
+We provide a convenient script `scripts/run_and_check.py` to evaluate one single sample source code against a reference source code, check correctness and compute speedup. You can use this to evaluate a kernel either locally or remotely by setting `eval_mode=local` or `eval_mode=modal`.
#### Overall Benchmark Metric
@@ -63,164 +66,93 @@ We organize the repo into the following structure:
KernelBench/
├── assets/
├── KernelBench/ # Benchmark dataset files
-├── src/ # KernelBench logic code
+├── src/kernelbench/ # KernelBench logic code
│ ├── unit_tests/
│ ├── prompts/
│ ├── ....
├── scripts/ # helpful scripts to run the benchmark
├── results/ # baseline times across hardware
├── runs/ # where your runs will be stored
+├── notebooks/ # example notebooks for analysis
+├── pyproject.toml # Project configuration and dependencies
```
## 🔧 Set up
-```
-conda create --name kernel-bench python=3.10
-conda activate kernel-bench
-pip install -r requirements.txt
-pip install -e .
-```
-### GPU Setup
-Running and profiling kernels require a GPU.
-If you don't have GPU available locally, you can set up [Modal](https://modal.com/). Set up your modal token after creating an account by running `modal token new`. Then, use the `generate_and_eval_single_sample_modal.py` script.
+We have transitioned to using `pyproject.toml` and `uv` for dependency management. Install [uv](https://docs.astral.sh/uv/getting-started/installation/) if you haven't already.
-#### NVIDIA (CUDA)
-- Use default backend `cuda` (recommended).
-- Ensure a CUDA-enabled PyTorch install.
+```bash
+# Install base dependencies (works without a local GPU)
+uv sync
-#### AMD ROCm (Radeon / MI-Series)
-KernelBench can run on AMD GPUs via ROCm (HIP) using the same PyTorch `torch.cuda` API.
+# Install ROCm-enabled PyTorch (pick the correct ROCm version for your system):
-1) Install ROCm-enabled PyTorch (pick the correct ROCm version for your system):
-```
-# Example (adjust ROCm version as needed)
-pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/rocm6.4
-```
+uv pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm7.1
-2) Verify GPU visibility:
-```
-python - <<'PY'
-import torch
-print("HIP:", torch.version.hip)
-print("GPU:", torch.cuda.get_device_name(0))
-print(torch.cuda.get_device_properties(0))
-PY
-```
+# Install with GPU dependencies (for local GPU evaluation)
+uv sync --extra gpu
-3) Optional: select specific GPU(s)
-```
-export HIP_VISIBLE_DEVICES=0
-export ROCR_VISIBLE_DEVICES=0
+# Run commands with uv (which invokes the right env)
+uv run python scripts/<script_name>.py ...
```
-> Note: For AMD, use `backend=triton` or `backend=helion` where applicable. CUDA backend is NVIDIA-only.
+You can still use `conda (python=3.10)` to create your environment and install dependencies with `requirements.txt`.
-##### AMD ROCm Tips
-- **What works**: AMD hardware-aware prompts, Triton backend generation, and ROCm-friendly timing.
-- **What does not (by default)**: CUDA backend evaluation on ROCm is blocked to avoid CUDA-only compile paths.
-- **Troubleshooting**: Ensure Triton is ROCm-enabled and PyTorch is a ROCm build.
+We use `litellm` for API calls. Please set your keys by creating a `.env` following our `.env.example`.
-To call LLM API providers, set the provider API key in your environment:
-```
-export OPENAI_API_KEY="your_api_key_here"
-```
+Running and profiling kernels require a GPU.
+If you don't have a GPU available locally, you can set up [Modal](https://modal.com/) for cloud serverless GPU evaluation. Set up your modal token after creating an account by running `modal token new`. Then, use the `generate_and_eval_single_sample_modal.py` script.
+
+You can also try out our [tutorial notebook](https://bit.ly/kernelbench-neurips-colab) (also in notebooks/tutorial.ipynb) with Google Colab.
## 🚀 Usage
-### Run on a single problem
-This will fetch the problem, generate a sample, and evaluate the sample.
+### Run on a single problem
+It is easier to get started with a single problem. This will fetch the problem, generate a sample, and evaluate the sample.
-```
-# Example: run level 2 problem 40 from Hugging Face
-python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" level=2 problem_id=40
+```bash
+# for example, run level 2 problem 40 from huggingface and use google gemini 2.5 flash for generation
+
+uv run python scripts/generate_and_eval_single_sample.py dataset_src=huggingface level=2 problem_id=40 server_type=google model_name=gemini/gemini-2.5-flash
# dataset_src could be "local" or "huggingface"
-# add .verbose_logging for more visibility
+# add .verbose_logging for more visibility
```
-We also support other GPU programming languages beyond `cuda`. Set `backend=triton`, `backend=cute`, or `backend=helion` as needed.
+**What you might need to modify**
+* **`gpu_arch`** - Depending on your GPU, you might need to adjust the `gpu_arch` argument to reflect your hardware. Currently supported: `["gfx1100"]` (W7900D), `["gfx1201"]` (R9700).
+* **`precision`** - You can specify the tensor precision with `precision=fp32`. Currently all of our reported results are `fp32`, but we have added support for `fp16` & `bf16`.
+* **`backend`** - We also support other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support the DSLs `cuda`, `triton`, `cute`, `tilelang`, and `thunderkittens`. Note: ROCm GPUs currently use `backend=triton`.
-#### AMD ROCm Example Commands
-Use `backend=triton` (recommended) or `backend=helion` on AMD GPUs:
-```
-# Triton on AMD ROCm (single problem)
-python3 scripts/generate_and_eval_single_sample.py \
- dataset_src="huggingface" level=2 problem_id=40 \
- backend=triton
-
-# Helion on AMD ROCm (single problem) (still in progress)
-python3 scripts/generate_and_eval_single_sample.py \
- dataset_src="huggingface" level=2 problem_id=40 \
- backend=helion
-```
-If you want to target a specific AMD GPU:
-```
-HIP_VISIBLE_DEVICES=0 ROCR_VISIBLE_DEVICES=0 \
-python3 scripts/generate_and_eval_single_sample.py \
- dataset_src="huggingface" level=2 problem_id=40 \
- backend=triton
-```
+Note on setting up ThunderKittens (TK) locally: to use `backend=thunderkittens`, you need to git clone the ThunderKittens repo and set the following environment variable to point to your local ThunderKittens directory: `export THUNDERKITTENS_ROOT=`. All ThunderKittens programs, as shown in the [example](src/kernelbench/prompts/model_new_ex_add_thunderkittens.py), should contain `tk_root = os.environ.get("THUNDERKITTENS_ROOT", "/root/ThunderKittens")`, which enables the kernel to include the right TK primitives. In addition, we only support BF16 for TK right now.
-##### Optional: Force AMD Prompt Inputs
-Some scripts auto-detect GPU vendor/name. You can override:
-```
-python3 scripts/generate_and_eval_single_sample.py \
- dataset_src=huggingface \
- level=1 \
- problem_id=1 \
- backend=triton \
- gpu_vendor=amd \
- gpu_name=MI355X
-```
+Check the config fields for a comprehensive set of options. Note that we provide the model with a one-shot example by default along with the minimum set of info; you can check out other prompt settings or construct your own in `src/prompt_constructor_toml.py`.
-### Run on all problems
+### Run on all problems
-```
+```bash
# 1. Generate responses and store kernels locally to runs/{run_name} directory
-python3 scripts/generate_samples.py \
- run_name=test_hf_level_1 dataset_src=huggingface level=1 num_workers=50 \
- server_type=deepseek model_name=deepseek-chat temperature=0
-
-# If you use LLM_GATEWAY_KEY (AMD gateway), set server_type=openai and temperature=1
+uv run python scripts/generate_samples.py run_name=test_hf_level_1 dataset_src=huggingface level=1 num_workers=50 server_type=deepseek model_name=deepseek-chat temperature=0
-# 2. Evaluate all generated kernels in runs/{run_name}
-python3 scripts/eval_from_generations.py run_name=test_hf_level_1 dataset_src=local level=1 num_gpu_devices=8 timeout=300
+# 2. Evaluate on all generated kernels in runs/{run_name} directory
+uv run python scripts/eval_from_generations.py run_name=test_hf_level_1 dataset_src=local level=1 num_gpu_devices=8 timeout=300
-# To speed up evaluation, parallelize compilation on CPUs before GPU evaluation.
-# Add build_cache=True and num_cpu_workers= to the command.
-```
-
-##### AMD Triton Quick Start (batch)
-```
-python3 scripts/generate_samples.py \
- run_name=amd_test \
- dataset_src=huggingface \
- level=1 \
- backend=triton
-
-python3 scripts/eval_from_generations.py \
- run_name=amd_test \
- dataset_src=huggingface \
- level=1 \
- backend=triton \
- eval_mode=local
-```
-
-##### AMD Baseline Timing
-```
-python3 scripts/get_baseline_time_single_problem.py
+# If you would like to speed up evaluation, you can parallelize compilation on CPUs before evaluating on GPUs
+# add build_cache=True and num_cpu_workers=<num_cpu_workers> to the command
```
### Analyze the eval results to compute Benchmark Performance
-Use `scripts/benchmark_eval_analysis.py` to compute success rate, timing metrics, and overall benchmark performance `fast_p`.
+We provide `scripts/benchmark_eval_analysis.py` to analyze the eval results and compute the success rate, timing metrics, and overall benchmark performance `fast_p`.
+```bash
+uv run python scripts/benchmark_eval_analysis.py run_name=test_hf_level_1 level=1 hardware=L40S_matx3 baseline=baseline_time_torch
```
-python3 scripts/benchmark_eval_analysis.py run_name=test_hf_level_1 level=1 hardware=L40S_matx3 baseline=baseline_time_torch
-```
-If you use different hardware, generate a baseline with `scripts/generate_baseline_time.py`.
-We provide reference baselines for various NVIDIA GPUs in `results/timing`, but we recommend generating your own for accuracy (cluster power and software versions affect timing). See `results/timing/README.md` for details.
+If you are using different hardware, you can generate the baseline time with the `scripts/generate_baseline_time.py` script.
+We provide some reference baseline times for a variety of NVIDIA GPUs across generations in `results/timing`, but we recommend generating your own baseline times for more accurate results (cluster power and software versions all affect timing). See `results/timing/README.md` for more details.
-### Multi-Turn Framework
-We have also releaed the test-time framework [Caesar](https://github.com/simonguozirui/caesar) that are used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.
+### Multi-Turn Framework & Integrations
+We have also released the test-time framework [Caesar](https://github.com/ScalingIntelligence/caesar) that is used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems.
+
+You can also use KernelBench as a library for your projects, for example: `from kernelbench import timing`, `from kernelbench import eval as kb_eval`, or `from kernelbench.utils import set_gpu_arch`.
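A minimal sketch of that library-style usage; the positional argument order of `eval_kernel_against_ref` (reference source first, generated source second) is an assumption, and the two source strings are placeholders you would load yourself:

```python
from kernelbench import eval as kb_eval
from kernelbench.utils import set_gpu_arch

set_gpu_arch(["gfx1100"])   # arch list documented in the single-problem section above

ref_model_src = "..."       # placeholder: reference PyTorch `Model` source code
custom_model_src = "..."    # placeholder: model-generated `ModelNew` source code

result = kb_eval.eval_kernel_against_ref(
    ref_model_src,
    custom_model_src,
    backend="triton",       # ROCm GPUs use the triton backend per this patch
    device=0,               # integer device indices are accepted (see the eval.py hunk in patch 3)
    verbose=True,
)
print(result)
```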
## 🛣️ Upcoming Roadmap
Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issues/74) for what we plan to add as features. We welcome community contributions in these directions.
@@ -228,6 +160,8 @@ Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issue
## 🔍 Known Usage
Since release, we have gotten a lot of interest from researchers, research labs, and companies that use KernelBench to explore this direction. We have documented [known usage](https://docs.google.com/document/d/e/2PACX-1vTjS-UMH1HB5n_PENq2k-3YRfXIXkqKIKeNC2zcWMyLPdl4Jrwvdk4dNDVSsM8ybKrCxZB7GJq1slZF/pub) of KernelBench and related efforts towards automated kernel generations. If you are using KernelBench, we love to hear more about it!
+Disclaimer: KernelBench is designed as an open-source evaluation framework and toolkit. The KernelBench team does not review, validate, or endorse individual kernels or reported results. Users are responsible for independently verifying any results obtained using the framework. Please check out `EVAL.md` for more guidance on benchmarking and evaluating kernels.
+
## 🪪 License
MIT. Check `LICENSE.md` for more details.
From b1659f4e4119986eb888900016cc1d8090df5aca Mon Sep 17 00:00:00 2001
From: 01xjw <220233704@seu.edu.cn>
Date: Thu, 29 Jan 2026 11:15:27 +0800
Subject: [PATCH 3/3] Switch back to pyproject.toml packaging and drop helion/MI355X leftovers
---
pyproject.toml | 67 +++++++++++++++++++
setup.py | 9 ---
src/kernelbench/eval.py | 9 ++-
src/kernelbench/prompt_constructor_toml.py | 50 ++------------
src/kernelbench/prompts/hardware/gpu_specs.py | 45 -------------
5 files changed, 75 insertions(+), 105 deletions(-)
create mode 100644 pyproject.toml
delete mode 100644 setup.py
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..4eb6ea4f
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,67 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+
+# this should be our single source of truth for versioning
+
+[project]
+name = "kernelbench"
+version = "0.2.0.dev0"
+requires-python = "==3.10.*"
+dependencies = [
+ # Frameworks
+ "torch==2.9.0",
+
+ "pytorch-triton-rocm>=3.4.0",
+ "transformers",
+ "datasets",
+ "modal",
+ "ruff",
+
+ # helper
+ "tqdm",
+ "packaging",
+ "setuptools",
+ "pydra-config",
+ "ninja",
+ "tomli",
+ "tabulate",
+
+ # Numerics
+ "einops",
+ "python-dotenv",
+ "numpy",
+
+ # LLM providers
+ "openai",
+ "litellm[proxy]",
+]
+
+[project.optional-dependencies]
+gpu = [
+ # GPU-specific dependencies (ROCm / AMD Radeon)
+ "triton",
+ "tilelang",
+]
+dev = [
+ "pytest",
+ "ruff",
+]
+
+
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["kernelbench*"]
+
+[tool.setuptools.package-data]
+kernelbench = ["prompts/**/*"]
+
+[tool.uv.sources]
+torch = [{ index = "pytorch-rocm" }]
+torchvision = [{ index = "pytorch-rocm" }]
+pytorch-triton-rocm = [{ index = "pytorch-rocm" }]
+
+[[tool.uv.index]]
+name = "pytorch-rocm"
+url = "https://download.pytorch.org/whl/rocm6.4"
+explicit = true
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 83d82456..00000000
--- a/setup.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from setuptools import setup, find_packages
-
-if __name__ == "__main__":
- setup(
- name="kernelbench",
- version="0.2.0",
- package_dir={"": "src"},
- packages=find_packages(where="src"),
- )
diff --git a/src/kernelbench/eval.py b/src/kernelbench/eval.py
index 5ccfa708..170a4bcf 100644
--- a/src/kernelbench/eval.py
+++ b/src/kernelbench/eval.py
@@ -556,7 +556,7 @@ def eval_kernel_against_ref(
verbose: Enable verbose logging
build_dir: Directory for caching compiled kernels
device: GPU device to run evaluation on (CUDA or ROCm)
- backend: One of 'cuda', 'triton', 'tilelang', 'cute', or 'helion'
+ backend: One of 'cuda', 'triton', 'tilelang', or 'cute'
precision: torch.dtype for computation (note: tilelang only supports fp16)
check_for_excessive_speedup: Guard against potential reward hacking
excessive_speedup_threshold: Flag if kernel is more than this faster than reference
@@ -590,10 +590,9 @@ def eval_kernel_against_ref(
# Backends that use tempfile approach
# - triton: @triton.jit decorator requires file-based import
# - cute: CUTLASS requires file-based compilation
- # - helion: @helion.kernel decorator requires inspect.getsource()
# - tilelang: JIT requires file-based import
backend_lower = backend.lower()
- uses_tempfile = backend_lower in ["triton", "tilelang", "cute", "helion"]
+ uses_tempfile = backend_lower in ["triton", "tilelang", "cute"]
metadata = {} # for storing result metadata
metadata["hardware"] = torch.cuda.get_device_name(device=device)
@@ -609,7 +608,7 @@ def eval_kernel_against_ref(
metadata["compute_capability"] = gpu_info.get("compute_capability", "unknown")
if uses_tempfile:
- # Set device visibility for triton/cute/helion/tilelang
+ # Set device visibility for triton/cute/tilelang
if isinstance(device, int):
device_num = device
elif isinstance(device, torch.device):
@@ -661,7 +660,7 @@ def eval_kernel_against_ref(
tempfile = None
# add hash for later to distinguish between multi-turn kernels
- if backend_lower in ["triton", "tilelang", "cute", "helion"]:
+ if backend_lower in ["triton", "tilelang", "cute"]:
# Use tempfile approach for triton, tilelang, and cute
# These DSLs require proper module import for JIT decorators to work
ModelNew, tempfile = load_custom_model_with_tempfile(
diff --git a/src/kernelbench/prompt_constructor_toml.py b/src/kernelbench/prompt_constructor_toml.py
index 82dcdda3..1de60288 100644
--- a/src/kernelbench/prompt_constructor_toml.py
+++ b/src/kernelbench/prompt_constructor_toml.py
@@ -184,36 +184,7 @@ def _gpu_context_from_gpu_specs(py_path: str, gpu_name: str, vendor: str = "nvid
"gpu_best_practices_bullets": best_bullets,
"gpu_vendor": vendor.lower(),
"gpu_vendor_display": vendor_display,
- }
-
- # Add AMD-specific prompts to context-In progress
- # if is_amd:
- # context["amd_high_correct_prompt"] = high_correct_prompt
- # context["amd_rdna4_prompt"] = rdna4_prompt
- # context["amd_quant_op_prompt"] = quant_op_prompt
- # context["amd_helion_prompt"] = helion_prompt
-
- # # Determine which AMD prompts to include based on GPU architecture
- # gpu_name_lower = gpu_name.lower()
- # gpu_arch_lower = gpu_architecture.lower()
-
- # amd_extra_guidance = ""
- # amd_extra_guidance += "\n\n### AMD GPU Optimization Guidance\n\n"
- # amd_extra_guidance += high_correct_prompt
-
- # # RDNA4 specific guidance
- # if any(x in gpu_name_lower or x in gpu_arch_lower for x in ["gfx12", "rdna4", "rx 9", "rx9", "r9700"]):
- # amd_extra_guidance += "\n\n" + rdna4_prompt
-
- # # CDNA / MI series specific guidance
- # if any(x in gpu_name_lower or x in gpu_arch_lower for x in ["mi300", "mi355", "mi3", "gfx9"]):
- # amd_extra_guidance += "\n\n" + quant_op_prompt
-
- # # Helion guidance for all AMD GPUs
- # amd_extra_guidance += "\n\n" + helion_prompt
-
- # context["amd_optimization_guidance"] = amd_extra_guidance
-
+ }
return context
def render_prompt_by_option(
@@ -441,7 +412,7 @@ def get_prompt_for_backend(
option: The prompt option (zero_shot, one_shot, few_shot)
precision: Optional precision (fp32, fp16, bf16) - defaults to fp32 if not provided
include_hardware: When True, append hardware guidance blocks (requires gpu_name)
- gpu_name: GPU identifier used when include_hardware is True (e.g., "A100", "MI355X")
+ gpu_name: GPU identifier used when include_hardware is True (e.g., "A100", "R9700", "W7900D")
vendor: GPU vendor ("nvidia" or "amd")
"""
return render_prompt_by_option(
@@ -483,7 +454,7 @@ def get_custom_prompt(
option: The prompt option (zero_shot, one_shot, few_shot)
precision: Optional precision (fp32, fp16, bf16)
include_hardware: When True, include hardware guidance
- gpu_name: GPU identifier (e.g., "A100", "MI355X")
+ gpu_name: GPU identifier (e.g., "A100", "R9700", "W7900D")
prompts_toml: Path to prompts.toml file
vendor: GPU vendor ("nvidia" or "amd")
"""
@@ -576,17 +547,6 @@ def test_prompt():
)
log_prompt(hardware_prompt, os.path.join(scratch_dir), "hardware_prompt.txt")
- # AMD hardware prompt (MI355X - CDNA)
- amd_cdna_prompt = get_prompt_for_backend(
- ref_arch_src=ref_arch_src,
- backend="triton",
- option="one_shot",
- precision="fp32",
- include_hardware=True,
- gpu_name="MI355X",
- vendor="amd",
- )
- log_prompt(amd_cdna_prompt, os.path.join(scratch_dir), "amd_cdna_prompt.txt")
# AMD hardware prompt (RDNA4 - R9700)
amd_rdna4_prompt = get_prompt_for_backend(
@@ -598,7 +558,7 @@ def test_prompt():
gpu_name="R9700",
vendor="amd",
)
- log_prompt(amd_rdna4_prompt, os.path.join(scratch_dir), "amd_rdna4_prompt.txt")
+
# custom prompt defined in prompts.toml
custom_prompt = get_custom_prompt(
@@ -615,7 +575,5 @@ def test_prompt():
)
log_prompt(custom_prompt, os.path.join(scratch_dir), "custom_prompt.txt")
- print("All prompts generated successfully!")
-
if __name__ == "__main__":
test_prompt()
\ No newline at end of file
diff --git a/src/kernelbench/prompts/hardware/gpu_specs.py b/src/kernelbench/prompts/hardware/gpu_specs.py
index 6e28cc82..f9e343d0 100644
--- a/src/kernelbench/prompts/hardware/gpu_specs.py
+++ b/src/kernelbench/prompts/hardware/gpu_specs.py
@@ -170,51 +170,6 @@
"Warp Size": 32,
"MFMA": "Unknown",
},
- # Based on provided rocminfo for AMD Instinct MI355X (gfx950)
- "MI355X": {
- "GPU Name": "AMD Instinct MI355X (gfx950)",
- "GPU Architecture": "gfx950 (CDNA family)",
- "Compute Units": 256,
- "SIMDs per CU": 4,
- "Shader Engines": 32,
- "Shader Arrays per Engine": 1,
- "Wavefront Size": "Wave64",
- "Max Clock (MHz)": 2400,
- "Peak Engine Clock": "2.4 GHz",
- "Workgroup Max Size": 1024,
- "Max Waves per CU": 32,
- "Max Work-item per CU": 2048,
- "Matrix Cores": 1024,
- "Stream Processors": 16384,
- "L1 Cache": "32 KB",
- "L2 Cache": "4 MB",
- "L3 Cache": "256 MB",
- "Cacheline Size": "128 B",
- "LDS (Workgroup Local Memory)": "160 KB",
- "VRAM": "288 GB HBM3E (309,220,868,096 B)",
- "Memory Bandwidth": "8 TB/s",
- "Memory Interface": "8192 bits",
- "Infinity Cache (Last Level)": "256 MB",
- "FP16 Vector TFLOPS": "157.3",
- "FP16 Matrix PFLOPS": "2.5166 (5.0332 w/ sparsity)",
- "BF16 Matrix PFLOPS": "2.5166 (5.0332 w/ sparsity)",
- "INT8 Matrix POPS": "5.0332 (10.0664 w/ sparsity)",
- "MXFP8 PFLOPS": "5.0332",
- "OCP-FP8 PFLOPS": "5.0332 (10.0664 w/ sparsity)",
- "MXFP6 PFLOPS": "10.0663",
- "MXFP4 PFLOPS": "10.0663",
- "FP64 Vector TFLOPS": "78.6",
- "FP32 Vector TFLOPS": "157.3",
- "FP64 Matrix TFLOPS": "78.6",
- "FP32 Matrix TFLOPS": "157.3",
- "Max Registers per Block": 131072,
- "Max Shared Memory per Block": 163840,
- "Max Threads per Block": 1024,
- "Max Threads per CU": 2048,
- "Shared Memory per CU": 41943040,
- "Warp Size": 64,
- "MFMA": "Unknown",
- },
# Based on provided rocminfo + HIP query for AMD Radeon PRO W7900D (gfx1100)
"W7900D": {
"GPU Name": "AMD Radeon PRO W7900D (gfx1100)",