From 67e0c15276f55043fef347a62b19938c7f274b0e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Tue, 18 Mar 2025 11:11:50 -0700 Subject: [PATCH 01/15] triton_backend_v2 --- scripts/eval_from_generations.py | 13 +- scripts/generate_and_eval_single_sample.py | 27 +- .../generate_and_eval_single_sample_modal.py | 38 +- scripts/generate_samples.py | 2 + src/eval.py | 68 ++- src/prompt_constructor_triton.py | 489 ++++++++++++++++++ .../triton/model_new_ex_add_triton.py | 63 +++ src/prompts/model_new_ex_add_triton.py | 63 +++ 8 files changed, 731 insertions(+), 32 deletions(-) create mode 100644 src/prompt_constructor_triton.py create mode 100644 src/prompts/few_shot/triton/model_new_ex_add_triton.py create mode 100644 src/prompts/model_new_ex_add_triton.py diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 82913fce..e007abbe 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -6,7 +6,8 @@ import json from tqdm import tqdm -from src import eval, utils, compile +# Import only what we need +from src import compile import torch import os import multiprocessing as mp @@ -15,7 +16,7 @@ from datasets import load_dataset from src.dataset import construct_kernelbench_dataset -from src.eval import build_compile_cache, eval_kernel_against_ref, KernelExecResult, check_metadata_serializable_all_types +from src.eval import eval_kernel_against_ref, KernelExecResult, check_metadata_serializable_all_types from src.utils import set_gpu_arch, read_file """ @@ -82,6 +83,9 @@ def __init__(self): # number of GPUs to do batch evaluation self.num_gpu_devices = 1 + # Backend to use for kernel implementation (cuda or triton) + self.backend = "cuda" + def __repr__(self): return f"EvalConfig({self.to_dict()})" @@ -160,6 +164,7 @@ def evaluate_single_sample(work_args: WorkArgs, configs: EvalConfig, dataset, ru num_perf_trials=configs.num_perf_trials, build_dir=build_dir, device=device, + backend=configs.backend, ) return eval_result except Exception as e: @@ -205,7 +210,7 @@ def cuda_single_eval_wrapper(curr_work: WorkArgs, configs: dict, dataset, run_di pool.join() raise except mp.TimeoutError as e: - print(f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}") + print(f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}\nException: {e}") print(f"[Eval Result] Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}: {result}") return result @@ -406,7 +411,7 @@ def main(config: EvalConfig): print(f"Evaluating 1 sample each for level {config.level} problems: {problem_id_range}") run_dir = os.path.join(config.runs_dir, config.run_name) - eval_file_path = os.path.join(run_dir, f"eval_results.json") + eval_file_path = os.path.join(run_dir, "eval_results.json") # set GPU arch to configure what target to build for diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 3fdb14b5..15ba4c1a 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -9,6 +9,7 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template +from src.prompt_constructor_triton import prompt_generate_custom_triton_from_prompt_template from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, 
create_inference_server_from_presets """ @@ -56,6 +57,8 @@ def __init__(self): self.log_generated_kernel = False self.log_eval_result = False + self.backend = "cuda" + def verbose_logging(self): self.log = True self.log_prompt = True @@ -127,27 +130,35 @@ def main(config: EvalConfig): - custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) + # Use appropriate prompt constructor based on backend + if config.backend == "cuda": + custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) + elif config.backend == "triton": + custom_prompt = prompt_generate_custom_triton_from_prompt_template(ref_arch_src) + else: + raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'.") + if config.log_prompt: with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f: - f.write(custom_cuda_prompt) + f.write(custom_prompt) # Query server with constructed prompt - custom_cuda = inference_server(custom_cuda_prompt) - custom_cuda = extract_first_code(custom_cuda, ["python", "cpp"]) - # check LLM is able to generate custom CUDA code - assert custom_cuda is not None, "Custom CUDA code generation failed" + custom_kernel = inference_server(custom_prompt) + custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) + # check LLM is able to generate custom kernel code + assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" # this should be optional if config.log: with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f: - f.write(custom_cuda) + f.write(custom_kernel) # 3. Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation kernel_exec_result = eval_kernel_against_ref( - ref_arch_src, custom_cuda, verbose=config.verbose, measure_performance=True, num_correct_trials=5, num_perf_trials=100 + ref_arch_src, custom_kernel, verbose=config.verbose, measure_performance=True, + num_correct_trials=5, num_perf_trials=100, backend=config.backend ) print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index e4a31233..d235b460 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -10,6 +10,7 @@ #from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template +from src.prompt_constructor_triton import prompt_generate_custom_triton_from_prompt_template from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets app = modal.App("eval_single_sample") @@ -63,6 +64,8 @@ def __init__(self): self.log_generated_kernel = False self.log_eval_result = False + self.backend = "cuda" + def verbose_logging(self): self.log = True self.log_prompt = True @@ -106,15 +109,17 @@ def __repr__(self): class EvalFunc: @modal.method() - def eval_single_sample_modal(self, ref_arch_src, custom_cuda, verbose, gpu_arch): + def eval_single_sample_modal(self, ref_arch_src, custom_kernel, verbose, gpu_arch, backend): # 3. 
Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation from src.eval import eval_kernel_against_ref - from src.utils import set_gpu_arch - set_gpu_arch(gpu_arch) + # Use utility function to set the GPU architecture in the modal environment + from src.utils import set_gpu_arch as modal_set_gpu_arch + modal_set_gpu_arch(gpu_arch) return eval_kernel_against_ref( - ref_arch_src, custom_cuda, verbose=verbose, measure_performance=True, num_correct_trials=5, num_perf_trials=100 + ref_arch_src, custom_kernel, verbose=verbose, measure_performance=True, + num_correct_trials=5, num_perf_trials=100, backend=backend ) @pydra.main(base=EvalConfig) @@ -174,24 +179,33 @@ def main(config: EvalConfig): - custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) + # Use appropriate prompt constructor based on backend + if config.backend == "cuda": + custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) + elif config.backend == "triton": + custom_prompt = prompt_generate_custom_triton_from_prompt_template(ref_arch_src) + else: + raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'.") + if config.log_prompt: with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f: - f.write(custom_cuda_prompt) + f.write(custom_prompt) # Query server with constructed prompt - custom_cuda = inference_server(custom_cuda_prompt) - custom_cuda = extract_first_code(custom_cuda, ["python", "cpp"]) - # check LLM is able to generate custom CUDA code - assert custom_cuda is not None, "Custom CUDA code generation failed" + custom_kernel = inference_server(custom_prompt) + custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) + # check LLM is able to generate custom kernel code + assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" # this should be optional if config.log: with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f: - f.write(custom_cuda) + f.write(custom_kernel) with app.run(): - kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote(ref_arch_src, custom_cuda, config.verbose, gpu_arch_mapping[config.gpu]) + kernel_exec_result = EvalFunc.with_options(gpu=config.gpu)().eval_single_sample_modal.remote( + ref_arch_src, custom_kernel, config.verbose, gpu_arch_mapping[config.gpu], config.backend + ) print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 0d552b8b..51f1e892 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -62,6 +62,8 @@ def __init__(self): self.log_prompt = False + self.backend = "cuda" + def greedy(self): # For greedy decoding, epsecially baseline eval self.greedy_sample = True diff --git a/src/eval.py b/src/eval.py index 4532154e..ba9bff30 100644 --- a/src/eval.py +++ b/src/eval.py @@ -112,6 +112,42 @@ def load_original_model_and_inputs( Model = context.get("Model") return (Model, get_init_inputs_fn, get_inputs_fn) +def load_custom_model_with_tempfile(code_string, build_directory= None ,entry_point="ModelNew"): + """ + Writes the provided Python code string to a temporary .py file, + dynamically imports the module so we can access the modified model class. 
+ + Returns both a Model class and the temporary file. The temporary file must be + deleted manually be the caller. + + This is a hack that is needed for triton code as compile / exec do not play well + with the @triton.jit decorator. + """ + + if build_directory: + model_custom_src = ( + "import os\n" f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_directory}'\n" + ) + model_custom_src + + # Create a temporary named file with a .py extension + with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tmp_file: + # Write the code string into the file + tmp_file.write(code_string) + # Capture the path to the file + tempfile_path = tmp_file.name + temp_file = tmp_file + + # Create a module specification pointing to our temp file + spec = importlib.util.spec_from_file_location("temp_module", tempfile_path) + # Create a new module based on that spec + temp_module = importlib.util.module_from_spec(spec) + # Execute the code in the module's namespace + spec.loader.exec_module(temp_module) + + ModelNew = getattr(temp_module, entry_point) + + # Return the object (class, function, etc.) that was defined in the code + return ModelNew, temp_file def load_custom_model( model_custom_src: str, context: dict, build_directory: str = None @@ -151,7 +187,7 @@ def _cleanup_cuda_extensions(): shutil.rmtree(torch_extensions_path) -def graceful_eval_cleanup(curr_context: dict, device: torch.device): +def graceful_eval_cleanup(curr_context: dict, device: torch.device, tempfile: tempfile.NamedTemporaryFile = None): """ Clean up env, gpu cache, and compiled CUDA extensions after evaluation """ # delete ran-specific function definitions before next eval run @@ -166,6 +202,9 @@ def graceful_eval_cleanup(curr_context: dict, device: torch.device): torch.cuda.synchronize( device=device ) # Wait for all CUDA operations to complete + if tempfile: + tempfile.close() + os.remove(tempfile.name) # _cleanup_cuda_extensions() # SIMON NOTE: is this necessary? 
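For context on how the new loader is meant to be used: load_custom_model_with_tempfile returns both the imported ModelNew class and the temp-file handle, and the caller (in this patch, eval_kernel_against_ref via graceful_eval_cleanup) is responsible for removing that file. Below is a minimal caller sketch under that contract; the ModelNew source string and variable names are illustrative placeholders, not code from the patch.

```python
# Minimal caller sketch for the tempfile-based loader added above.
# The placeholder source is trivial; a real string would contain @triton.jit kernels,
# which is why the patch writes it to a real .py file instead of using exec().
import os
from src.eval import load_custom_model_with_tempfile

custom_model_src = """
import torch.nn as nn

class ModelNew(nn.Module):
    def forward(self, x):
        return x
"""

ModelNew, tmp_file = load_custom_model_with_tempfile(custom_model_src)
try:
    model = ModelNew()          # class was dynamically imported from the temp module
finally:
    tmp_file.close()            # mirrors the tempfile handling in graceful_eval_cleanup
    os.remove(tmp_file.name)    # delete=False was used, so the caller must remove it
```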
@@ -301,6 +340,7 @@ def eval_kernel_against_ref( measure_performance: bool = False, build_dir: os.PathLike = None, device: torch.device = torch.cuda.current_device() if torch.cuda.is_available() else None, # have to run on GPU + backend: str = "cuda", # can be 'cuda' or 'triton' ) -> KernelExecResult: """ Evaluate the custom kernel against the original model @@ -308,6 +348,7 @@ def eval_kernel_against_ref( num_correct_trials: number of trials to initialize different random inputs; correctness pass only if all trials pass num_perf_trials: run the evalutation many times to take the average device: GPU (cuda) device to run the evalutation on + backend: str, either 'cuda' or 'triton', determines which backend implementation to use """ # TODO: check device is busy assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" @@ -320,7 +361,11 @@ def eval_kernel_against_ref( # set CUDA device torch.cuda.set_device(device) - + is_triton = backend == "triton" + if is_triton: + # need to set env var for triton code to guarentee no wrong device shennanignas + assert device.type == "cuda", "CUDA is not availible on device, cannot run Eval" + os.environ["CUDA_VISIBLE_DEVICES"] = str(device.index) context = {} if verbose: @@ -352,8 +397,12 @@ def eval_kernel_against_ref( # this is where compilation happens try: os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion + tempfile = None # add hash for later to distinguish between multi-turn kernels - ModelNew = load_custom_model(custom_model_src, context, build_dir) + if is_triton: + ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, build_dir) + else: + ModelNew = load_custom_model(custom_model_src, context, build_dir) torch.cuda.synchronize(device=device) # not sure if this is too much except Exception as e: print( @@ -367,11 +416,11 @@ def eval_kernel_against_ref( print( f"[Eval] Lock file error during compilation, Please retry. Error: {e}" ) - graceful_eval_cleanup(context, device) + graceful_eval_cleanup(context, device, tempfile) return None else: metadata["compilation_error"] = e - graceful_eval_cleanup(context, device) + graceful_eval_cleanup(context, device, tempfile) return KernelExecResult( compiled=False, metadata=metadata ) # skip further steps @@ -390,7 +439,7 @@ def eval_kernel_against_ref( f"Failed to load custom CUDA kernel; Compiled but not able to run, count as runtime error. \nError: {e}" ) # TODO: add metadata for runtime error e.g. error in launching kernel, illegal memory access, ... - graceful_eval_cleanup(context, device) + graceful_eval_cleanup(context, device, tempfile) metadata["runtime_error"] = e return KernelExecResult( compiled=True, correctness=False, metadata=metadata @@ -411,6 +460,7 @@ def eval_kernel_against_ref( verbose=verbose, seed=seed_num, device=device, + truncate_errors=not is_triton, ) except Exception as e: # TODO: add metadata for runtime error e.g. error in launching kernel, illegal memory access, ... 
@@ -454,7 +504,7 @@ def eval_kernel_against_ref( print(f"[Eval] Error in Measuring Performance: {e}") kernel_exec_result.metadata["error_during_performance"] = e - graceful_eval_cleanup(context, device) + graceful_eval_cleanup(context, device, tempfile) return kernel_exec_result @@ -550,6 +600,8 @@ def run_and_check_correctness( verbose=False, seed=42, device=None, + truncate_errors: bool =True, + ) -> KernelExecResult: """ run the model and check correctness, @@ -629,7 +681,7 @@ def run_and_check_correctness( print(f"Error in launching kernel for ModelNew: {e}") metadata = register_and_format_exception( - "runtime_error", e, metadata, truncate=True + "runtime_error", e, metadata, truncate=truncate_errors ) return KernelExecResult( compiled=True, correctness=False, metadata=metadata diff --git a/src/prompt_constructor_triton.py b/src/prompt_constructor_triton.py new file mode 100644 index 00000000..807cceb8 --- /dev/null +++ b/src/prompt_constructor_triton.py @@ -0,0 +1,489 @@ +import os +from .utils import read_file + + +""" +Construct Prompt + +Design principles: +- To evaluate base model performance on KernelBench, we use the simplest prompt possible to guide model output to generated desired output format. +- However, we do not do extensive prompt engineering or few-shot example in the LLM to steer behaviour. +""" + +REPO_TOP_PATH = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + ) +) +KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") + + +def get_arch_definition_from_file(arch_path): + arch_src = read_file(arch_path) + return get_arch_definition(arch_src) + + +def get_arch_definition(arch_src): + """ + Construct torch definition from original torch nn.Module definition + """ + prompt = f"Here is a pytorch defintion of a neural network architecture in the file model.py: ```{arch_src}```\n" + return prompt + + +############################################ +# Triton Prompt +############################################ + +PROBLEM_STATEMENT = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. \n + You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" + +PROBLEM_INSTRUCTION = """ +Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n +""" + +def prompt_generate_custom_triton( + arc_src: str, example_arch_src: str, example_new_arch_src: str +) -> str: + prompt = PROBLEM_STATEMENT + + assert "@triton.jit" in example_new_arch_src, "Example new arch must contain Triton kernel" + + if example_arch_src != "" and example_new_arch_src != "": + prompt += f""" + Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n + ``` \n + {example_arch_src} + ``` \n + The example new arch with custom Triton kernels looks like this: \n + ``` + {example_new_arch_src} + ``` \n + """ + + prompt += f""" + You are given the following architecture: \n + ``` + {arc_src} + ``` + """ + prompt += PROBLEM_INSTRUCTION + return prompt + + +PROBLEM_STATEMENT_CLEANED = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" +PROBLEM_INSTRUCTION_CLEANED = """ +Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +""" + +def prompt_generate_custom_triton_fewshot_and_template(ref_arch_src: str, shots: list) -> str: + """ + Generate a prompt with specified few-shot examples following a template + + shots: list of few-shot examples to include in the prompt + Avaliable few shot options to start with: + - ex_add: pointwise addition + - ex_fuse_gelu: fused gelu + - ex_mnist2: fused convolutions and relus (DEPRECATED) + - ex_tiled_matmul: tiled matrix multiplication + - ex_flash_attn: simple flash attention + """ + prompt = PROBLEM_STATEMENT_CLEANED + + # k = 1 + example_add = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_add.py") + ) + example_add_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_add.py") + ) + example_add_desc = "This given architecture is for a pointwise addition: " + + # k = 2 + example_fuse_gelu = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_fuse_gelu.py") + ) + example_fuse_gelu_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_fuse_gelu.py") + ) + example_fuse_gelu_desc = "This given architecture is for a fused gelu: " + + # k = 3 (DEPRECATED) + example_mnist2 = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py") + ) + example_mnist2_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py") + ) + exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: " + + # k = 4 + example_tiled_matmul = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_tiled_matmul.py") + ) + example_tiled_matmul_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_tiled_matmul.py") + ) + example_tiled_matmul_desc = "This given architecture 
is for a model with tiled matrix multiplication: " + + # k = 5 + example_flash_attn = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_flash_attn.py") + ) + example_flash_attn_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_flash_attn.py") + ) + example_flash_attn_desc = "This given architecture is for a model with simple io-aware implementation of attention, also known as flash attention: " + + examples = [] + for s in shots: + if s not in ["ex_add", "ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul", "ex_flash_attn"]: + raise ValueError(f"Invalid shot: {s}") + elif s == "ex_add": + examples.append((example_add, example_add_new, example_add_desc)) + elif s == "ex_fuse_gelu": + examples.append((example_fuse_gelu, example_fuse_gelu_new, example_fuse_gelu_desc)) + elif s == "ex_mnist2": # DEPRECATED + raise ValueError("ex_mnist2 is deprecated") + examples.append((example_mnist2, example_mnist2_new, exmaple_mnist2_desc)) + elif s == "ex_tiled_matmul": + examples.append((example_tiled_matmul, example_tiled_matmul_new, example_tiled_matmul_desc)) + elif s == "ex_flash_attn": + examples.append((example_flash_attn, example_flash_attn_new, example_flash_attn_desc)) + + + for i, tup in enumerate(examples): + base, kernel, desc = tup + + prompt += f""" +Example {i+1}:\n\n +Here is an example architecture:\n\n +``` +{base} +```\n +{PROBLEM_INSTRUCTION_CLEANED} \n +Here is an optimized verison with custom CUDA kernels: \n +``` +{kernel} +```\n\n +""" + +# should we put task here? + prompt += f""" +Task:\n\n +Here is an example architecture:\n\n +``` +{ref_arch_src} +```\n +""" + prompt += PROBLEM_INSTRUCTION_CLEANED + return prompt + +def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) -> str: + """ + Generate a prompt with a CoT example following a template + Avaliable CoT examples: + - ex_fuse_gelu: fused gelu + - ex_mnist2: fused convolutions and relus + - ex_tiled_matmul: tiled matrix multiplication + """ + + # I updated this to allow CoT. Also explicilty state think step by step. + PROBLEM_INSTRUCTION_COT = """ +Optimize the architecture named Model with custom CUDA operators! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Do not output testing code. 
+In the end, make sure the final code block contains code for output architecture ModelNew with cuda code.\n +Let's think step by step.\n +""" + + prompt = PROBLEM_STATEMENT_CLEANED + + assert cot_example in ["ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul"] + + # k = 2 + example_fuse_gelu = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_fuse_gelu.py") + ) + example_fuse_gelu_cot = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_fuse_gelu.py") + ) + example_fuse_gelu_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_fuse_gelu.py") + ) + example_fuse_gelu_desc = "This given architecture is for a fused gelu: " + + # k = 3 + example_mnist2 = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py") + ) + example_mnist2_cot = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_mnist2.py") + ) + example_mnist2_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py") + ) + exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: " + + # k = 4 + example_tiled_matmul = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_tiled_matmul.py") + ) + example_tiled_matmul_cot = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_tiled_matmul.py") + ) + example_tiled_matmul_new = read_file( + os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_tiled_matmul.py") + ) + example_tiled_matmul_desc = "This given architecture is for a model with tiled matrix multiplication: " + + match cot_example: + case "ex_fuse_gelu": + base = example_fuse_gelu + cot = example_fuse_gelu_cot + kernel = example_fuse_gelu_new + desc = example_fuse_gelu_desc + case "ex_mnist2": + base = example_mnist2 + cot = example_mnist2_cot + kernel = example_mnist2_new + desc = exmaple_mnist2_desc + case "ex_tiled_matmul": + base = example_tiled_matmul + cot = example_tiled_matmul_cot + kernel = example_tiled_matmul_new + desc = example_tiled_matmul_desc + case _: + raise ValueError(f"Invalid CoT example: {cot_example} not found in CoT examples") + + # construct example with + # NOTE: we only do one example with CoT for now + # 1. ref_src problem -> 2. Instruction -> 3. CoT -> 4. 
Solution + prompt += f""" +Here is an example architecture:\n\n +``` +{base} +```\n +{PROBLEM_INSTRUCTION_COT} \n +{cot} \n +``` +{kernel} +```\n\n +""" + +# show task to solve + prompt += f""" +Task:\n\n +Here is an example architecture:\n\n +``` +{ref_arch_src} +```\n +""" + prompt += PROBLEM_INSTRUCTION_COT + + return prompt + + +def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str: + """ + Using prompt example (an element-wise addition) for prompt templates + The most basic form of example just to show LLM the task and the expected output format + """ + arch = ref_arch_src + # These are strictly defined for now + + # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) + example_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_ex_add.py" + ) + example_new_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" + ) + + if not os.path.exists(example_arch_path): + raise FileNotFoundError( + f"Example architecture file not found: {example_arch_path}" + ) + if not os.path.exists(example_new_arch_path): + raise FileNotFoundError( + f"Example new architecture file not found: {example_new_arch_path}" + ) + + example_arch = read_file(example_arch_path) + example_new_arch = read_file(example_new_arch_path) + + return prompt_generate_custom_triton(arch, example_arch, example_new_arch) + + +def prompt_generate_prompt_with_hardware_info_from_template(ref_arch_src: str, gpu_name: str) -> str: + """ + Similar to prompt_generate_custom_triton_from_prompt_template, + but with hardware information for the given GPU + """ + + arch = ref_arch_src + # These are strictly defined for now + + # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) + example_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_ex_add.py" + ) + example_new_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" + ) + + gpu_spec_file_path = os.path.join(REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py") + + example_arch = read_file(example_arch_path) + example_new_arch = read_file(example_new_arch_path) + gpu_spec_info = read_file(gpu_spec_file_path) + + return prompt_generate_prompt_with_hardware_info( + ref_arch_src=arch, + gpu_name=gpu_name, + example_arch_src=example_arch, + example_new_arch_src=example_new_arch, + gpu_spec_info_src=gpu_spec_info + ) + + + +def prompt_generate_prompt_with_hardware_info(ref_arch_src: str, + gpu_name: str, + example_arch_src: str, + example_new_arch_src: str, + gpu_spec_info_src: str) -> str: + """ + Generate a prompt with hardware information for the given GPU + gpu_spec_info_src: str of the gpu spec src file + """ + + # Create a dictionary to store the local namespace + local_dict = {} + + # Execute the GPU spec file in the local namespace + exec(gpu_spec_info_src, {}, local_dict) + + # Get the required variables from the local namespace + GPU_SPEC_INFO = local_dict.get('GPU_SPEC_INFO') + GPU_DEFINITIONS = local_dict.get('GPU_DEFINITIONS') + GPU_BEST_PRACTICES = local_dict.get('GPU_BEST_PRACTICES') + + if not GPU_SPEC_INFO or not GPU_DEFINITIONS or not GPU_BEST_PRACTICES: + raise ValueError("GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src") + + assert gpu_name in GPU_SPEC_INFO, f"GPU name {gpu_name} not found in GPU_SPEC_INFO" + + prompt = PROBLEM_STATEMENT + + if example_arch_src != "" and example_new_arch_src != "": + 
prompt += f""" + Here's an example to show you the syntax of inline embedding custom CUDA operators in torch: The example given architecture is: \n + ``` \n + {example_arch_src} + ``` \n + The example new arch with custom CUDA kernels looks like this: + ``` + {example_new_arch_src} + ``` \n + """ + + curr_gpu_spec_info = GPU_SPEC_INFO[gpu_name] + + gpu_architecture = curr_gpu_spec_info.get("GPU Architecture") + prompt += f""" + Here is some information about the underlying hardware that you should keep in mind. \n\n +The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.\n\n""" + + for key, value in curr_gpu_spec_info.items(): + if key == "GPU Architecture": + continue + prompt += f"""- We have {value} of {key}.\n""" + + + prompt += f"""\n\n +Here are some concepts about the GPU architecture that could be helpful: \n\n""" + for key, value in GPU_DEFINITIONS.items(): + prompt += f"""- {key}: {value}\n""" + + prompt += f"""\n\n +Here are some best practices for writing CUDA kernels on GPU: \n\n""" + for best_practice in GPU_BEST_PRACTICES: + prompt += f"""- {best_practice}\n""" + + + prompt += f""" + You are given the following architecture: \n + ``` + {ref_arch_src} + ``` + """ + + + prompt += PROBLEM_INSTRUCTION + return prompt + + + return None + + + + + +def prompt_fix_compile(ref_arch_src, custom_cuda, metadata): + prompt = PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed to compile: + ``` + {custom_cuda} + ``` + Here's the metadata of the compilation error: + ``` + {metadata} + ``` + + Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. + """ + return prompt + + +def prompt_fix_correctness(ref_arch_src, custom_cuda, metadata): + prompt = PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed correctness: + ``` + {custom_cuda} + ``` + Here's the metadata of the correctness error: + ``` + {metadata} + ``` + Please consider how your custom CUDA kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. 
+ """ + return prompt + +def main(): + gpu_name = "L40S" + + + ref_arch_src = read_file(os.path.join(KERNEL_BENCH_PATH, f"level1/19_ReLU.py")) + assert len(ref_arch_src) > 0, "ref_arch_src is empty" + prompt = prompt_generate_prompt_with_hardware_info_from_template(ref_arch_src, gpu_name) + print(prompt) + # Write prompt to temp file + temp_file_path = os.path.join(REPO_TOP_PATH, "scratch", "prompt_draft.txt") + os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) + with open(temp_file_path, "w") as f: + f.write(prompt) + +if __name__ == "__main__": + main() diff --git a/src/prompts/few_shot/triton/model_new_ex_add_triton.py b/src/prompts/few_shot/triton/model_new_ex_add_triton.py new file mode 100644 index 00000000..43a3f712 --- /dev/null +++ b/src/prompts/few_shot/triton/model_new_ex_add_triton.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.jit +def add_kernel( + x_ptr, # Pointer to first input + y_ptr, # Pointer to second input + out_ptr, # Pointer to output + n_elements, # Total number of elements in input/output + BLOCK_SIZE: tl.constexpr, +): + # Each program handles a contiguous block of data of size BLOCK_SIZE + block_start = tl.program_id(0) * BLOCK_SIZE + # Create a range of offsets [0..BLOCK_SIZE-1] + offsets = block_start + tl.arange(0, BLOCK_SIZE) + # Mask to ensure we don't go out of bounds + mask = offsets < n_elements + # Load input values + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + y = tl.load(y_ptr + offsets, mask=mask, other=0.0) + # Perform the elementwise addition + out = x + y + # Store the result + tl.store(out_ptr + offsets, out, mask=mask) + + +def triton_add(x: torch.Tensor, y: torch.Tensor): + """ + This function wraps the Triton kernel call. It: + 1. Ensures the inputs are contiguous on GPU. + 2. Calculates the grid (blocks) needed. + 3. Launches the Triton kernel. + """ + assert x.is_cuda and y.is_cuda, "Tensors must be on CUDA." 
+ x = x.contiguous() + y = y.contiguous() + + # Prepare output tensor + out = torch.empty_like(x) + + # Number of elements in the tensor + n_elements = x.numel() + BLOCK_SIZE = 128 # Tunable parameter for block size + + # Determine the number of blocks needed + grid = lambda meta: ((n_elements + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],) + + # Launch the Triton kernel + add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=BLOCK_SIZE) + return out + + +class ModelNew(nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, a, b): + # Instead of "return a + b", call our Triton-based addition + return triton_add(a, b) \ No newline at end of file diff --git a/src/prompts/model_new_ex_add_triton.py b/src/prompts/model_new_ex_add_triton.py new file mode 100644 index 00000000..43a3f712 --- /dev/null +++ b/src/prompts/model_new_ex_add_triton.py @@ -0,0 +1,63 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton +import triton.language as tl + + +@triton.jit +def add_kernel( + x_ptr, # Pointer to first input + y_ptr, # Pointer to second input + out_ptr, # Pointer to output + n_elements, # Total number of elements in input/output + BLOCK_SIZE: tl.constexpr, +): + # Each program handles a contiguous block of data of size BLOCK_SIZE + block_start = tl.program_id(0) * BLOCK_SIZE + # Create a range of offsets [0..BLOCK_SIZE-1] + offsets = block_start + tl.arange(0, BLOCK_SIZE) + # Mask to ensure we don't go out of bounds + mask = offsets < n_elements + # Load input values + x = tl.load(x_ptr + offsets, mask=mask, other=0.0) + y = tl.load(y_ptr + offsets, mask=mask, other=0.0) + # Perform the elementwise addition + out = x + y + # Store the result + tl.store(out_ptr + offsets, out, mask=mask) + + +def triton_add(x: torch.Tensor, y: torch.Tensor): + """ + This function wraps the Triton kernel call. It: + 1. Ensures the inputs are contiguous on GPU. + 2. Calculates the grid (blocks) needed. + 3. Launches the Triton kernel. + """ + assert x.is_cuda and y.is_cuda, "Tensors must be on CUDA." 
+ x = x.contiguous() + y = y.contiguous() + + # Prepare output tensor + out = torch.empty_like(x) + + # Number of elements in the tensor + n_elements = x.numel() + BLOCK_SIZE = 128 # Tunable parameter for block size + + # Determine the number of blocks needed + grid = lambda meta: ((n_elements + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],) + + # Launch the Triton kernel + add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=BLOCK_SIZE) + return out + + +class ModelNew(nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, a, b): + # Instead of "return a + b", call our Triton-based addition + return triton_add(a, b) \ No newline at end of file From 8bfdd210d85054ed7afca2cb6d9f82c3f6ac1988 Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 18 Mar 2025 11:30:12 -0700 Subject: [PATCH 02/15] fix eval bugs --- src/eval.py | 90 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 31 deletions(-) diff --git a/src/eval.py b/src/eval.py index ba9bff30..8902c488 100644 --- a/src/eval.py +++ b/src/eval.py @@ -2,17 +2,21 @@ Helpers for Evaluations """ +import importlib +import json +import os, subprocess +import random +import sys +import tempfile +from contextlib import redirect_stderr, redirect_stdout +from io import StringIO +from typing import Union + +import numpy as np import requests import torch import torch.nn as nn -import os, subprocess from pydantic import BaseModel -import numpy as np -import random -import json -from contextlib import redirect_stdout, redirect_stderr -from io import StringIO -import sys from . import utils @@ -112,7 +116,10 @@ def load_original_model_and_inputs( Model = context.get("Model") return (Model, get_init_inputs_fn, get_inputs_fn) -def load_custom_model_with_tempfile(code_string, build_directory= None ,entry_point="ModelNew"): + +def load_custom_model_with_tempfile( + code_string, build_directory=None, entry_point="ModelNew" +): """ Writes the provided Python code string to a temporary .py file, dynamically imports the module so we can access the modified model class. @@ -149,6 +156,7 @@ def load_custom_model_with_tempfile(code_string, build_directory= None ,entry_po # Return the object (class, function, etc.) that was defined in the code return ModelNew, temp_file + def load_custom_model( model_custom_src: str, context: dict, build_directory: str = None ) -> nn.Module: @@ -187,7 +195,11 @@ def _cleanup_cuda_extensions(): shutil.rmtree(torch_extensions_path) -def graceful_eval_cleanup(curr_context: dict, device: torch.device, tempfile: tempfile.NamedTemporaryFile = None): +def graceful_eval_cleanup( + curr_context: dict, + device: torch.device, + tempfile: tempfile.NamedTemporaryFile = None, +): """ Clean up env, gpu cache, and compiled CUDA extensions after evaluation """ # delete ran-specific function definitions before next eval run @@ -208,6 +220,7 @@ def graceful_eval_cleanup(curr_context: dict, device: torch.device, tempfile: te # _cleanup_cuda_extensions() # SIMON NOTE: is this necessary? + def build_compile_cache_legacy( custom_model_src: str, verbose: bool = False, @@ -241,11 +254,12 @@ def build_compile_cache_legacy( if verbose: print(f"[Compilation] Compilation Successful, saved cache at: {build_dir}") except Exception as e: - print(f"[Compilation] Failed to compile custom CUDA kernel. Unable to cache, \nError: {e}") + print( + f"[Compilation] Failed to compile custom CUDA kernel. 
Unable to cache, \nError: {e}" + ) return False, stdout_buffer.getvalue(), str(e) - - return True, stdout_buffer.getvalue(), None + return True, stdout_buffer.getvalue(), None def build_compile_cache( @@ -281,16 +295,16 @@ def build_compile_cache( if verbose: print(f"[Compilation] Compilation Successful, saved cache at: {build_dir}") except Exception as e: - print(f"[Compilation] Failed to compile custom CUDA kernel. Unable to cache, \nError: {e}") + print( + f"[Compilation] Failed to compile custom CUDA kernel. Unable to cache, \nError: {e}" + ) return False, stdout_buffer.getvalue(), str(e) return True, stdout_buffer.getvalue(), None def build_compile_cache_with_capturing( - custom_model_src: str, - verbose: bool = False, - build_dir: os.PathLike = None + custom_model_src: str, verbose: bool = False, build_dir: os.PathLike = None ) -> tuple[int, str, str]: """ Write a temporary python file to compile the custom model on CPU @@ -312,22 +326,21 @@ def build_compile_cache_with_capturing( f.write(custom_model_src) # Execute the temporary Python file and capture output - process = subprocess.Popen(['python', tmp], stdout=subprocess.PIPE, stderr=subprocess.PIPE) + process = subprocess.Popen( + ["python", tmp], stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) stdout, stderr = process.communicate() returncode = process.returncode # Clean up temporary file os.remove(tmp) - if verbose: print("[CPU Precompile] return code: ", returncode) - print("[CPU Precompile] stdout: \n", stdout.decode('utf-8')) - print("[CPU Precompile] stderr: \n", stderr.decode('utf-8')) - - return returncode, stdout.decode('utf-8'), stderr.decode('utf-8') - + print("[CPU Precompile] stdout: \n", stdout.decode("utf-8")) + print("[CPU Precompile] stderr: \n", stderr.decode("utf-8")) + return returncode, stdout.decode("utf-8"), stderr.decode("utf-8") def eval_kernel_against_ref( @@ -339,8 +352,10 @@ def eval_kernel_against_ref( verbose: bool = False, measure_performance: bool = False, build_dir: os.PathLike = None, - device: torch.device = torch.cuda.current_device() if torch.cuda.is_available() else None, # have to run on GPU - backend: str = "cuda", # can be 'cuda' or 'triton' + device: Union[torch.device, int] = ( + torch.cuda.current_device() if torch.cuda.is_available() else None + ), # have to run on GPU + backend: str = "cuda", # can be 'cuda' or 'triton' ) -> KernelExecResult: """ Evaluate the custom kernel against the original model @@ -363,9 +378,19 @@ def eval_kernel_against_ref( torch.cuda.set_device(device) is_triton = backend == "triton" if is_triton: - # need to set env var for triton code to guarentee no wrong device shennanignas - assert device.type == "cuda", "CUDA is not availible on device, cannot run Eval" - os.environ["CUDA_VISIBLE_DEVICES"] = str(device.index) + # need to set env var for triton code to guarentee no wrong device shennanignas + if isinstance(device, int): + device_num = device + elif isinstance(device, torch.device): + assert ( + device.type == "cuda" + ), "CUDA is not availible on device, cannot run Eval" + device_num = device.index + else: + raise ValueError( + f"device must be an int or torch.device, got {type(device)}" + ) + os.environ["CUDA_VISIBLE_DEVICES"] = str(device_num) context = {} if verbose: @@ -400,7 +425,9 @@ def eval_kernel_against_ref( tempfile = None # add hash for later to distinguish between multi-turn kernels if is_triton: - ModelNew, tempfile = load_custom_model_with_tempfile(custom_model_src, build_dir) + ModelNew, tempfile = load_custom_model_with_tempfile( + 
custom_model_src, build_dir + ) else: ModelNew = load_custom_model(custom_model_src, context, build_dir) torch.cuda.synchronize(device=device) # not sure if this is too much @@ -600,8 +627,7 @@ def run_and_check_correctness( verbose=False, seed=42, device=None, - truncate_errors: bool =True, - + truncate_errors: bool = True, ) -> KernelExecResult: """ run the model and check correctness, @@ -730,11 +756,13 @@ def check_metadata_serializable(metadata: dict): return metadata + def check_metadata_serializable_all_types(metadata: dict): """ Ensure metadata is JSON serializable, if not, convert non-serializable values to strings recursively """ + def convert_to_serializable(obj): if isinstance(obj, dict): return {k: convert_to_serializable(v) for k, v in obj.items()} From 32ff679f2512c7d61e3a887c7d5252cbcfa0fdab Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 18 Mar 2025 15:09:47 -0700 Subject: [PATCH 03/15] fix issues --- scripts/eval_from_generations.py | 248 ++++++++++++------ scripts/generate_and_eval_single_sample.py | 119 ++++++--- scripts/generate_samples.py | 181 ++++++++----- src/eval.py | 26 +- src/prompt_constructor_triton.py | 290 ++++----------------- 5 files changed, 428 insertions(+), 436 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index e007abbe..4f9e560d 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -1,23 +1,29 @@ -from dataclasses import dataclass +import json +import multiprocessing as mp +import os import shutil import time -import pydra -from pydra import REQUIRED, Config +import traceback +from dataclasses import dataclass -import json -from tqdm import tqdm -# Import only what we need -from src import compile +import pydra import torch -import os -import multiprocessing as mp - from datasets import load_dataset +from pydra import Config, REQUIRED + +# Import only what we need +from src import compile from src.dataset import construct_kernelbench_dataset -from src.eval import eval_kernel_against_ref, KernelExecResult, check_metadata_serializable_all_types -from src.utils import set_gpu_arch, read_file +from src.eval import ( + check_metadata_serializable_all_types, + eval_kernel_against_ref, + get_error_name, + KernelExecResult, +) +from src.utils import read_file, set_gpu_arch +from tqdm import tqdm """ Batch Evaluation from Existing Generations @@ -40,9 +46,9 @@ class EvalConfig(Config): def __init__(self): - self.run_name = REQUIRED # name of the run to evaluate + self.run_name = REQUIRED # name of the run to evaluate - self.dataset_src = REQUIRED # either huggingface or local + self.dataset_src = REQUIRED # either huggingface or local # name of dataset name on Hugging Face self.dataset_name = "ScalingIntelligence/KernelBench" @@ -51,7 +57,7 @@ def __init__(self): self.level = REQUIRED # subset of problems to evaluate - self.subset = (None, None) # (start_id, end_id), these are the logical index + self.subset = (None, None) # (start_id, end_id), these are the logical index # Evaluation Mode: local (requires GPU), see modal (cloud GPU) in the modal file self.eval_mode = "local" @@ -69,23 +75,24 @@ def __init__(self): # Eval settings self.num_correct_trials = 5 self.num_perf_trials = 100 - self.timeout = 180 # in seconds + self.timeout = 180 # in seconds self.measure_performance = True - + # Eval Flow setting # To speedup evaluation, you can start building the kernel on CPU on disk as cache self.build_cache = False - self.num_cpu_workers = 20 # number of parallel process to to 
parallelize the build on CPUs - + self.num_cpu_workers = ( + 96 # number of parallel process to to parallelize the build on CPUs + ) + # Directory to build kernels for evaluation self.kernel_eval_build_dir = os.path.join(REPO_TOP_DIR, "cache") # number of GPUs to do batch evaluation - self.num_gpu_devices = 1 - + self.num_gpu_devices = 8 + # Backend to use for kernel implementation (cuda or triton) self.backend = "cuda" - def __repr__(self): return f"EvalConfig({self.to_dict()})" @@ -98,43 +105,58 @@ class WorkArgs: device: torch.device -def fetch_ref_arch_from_problem_id(dataset, problem_id: int, dataset_src: str) -> str | None: +def fetch_ref_arch_from_problem_id( + dataset, problem_id: int, dataset_src: str +) -> str | None: """ Fetch reference architecture from problem directory Either from Hugging Face or Local Dataset """ if dataset_src == "huggingface": - curr_problem_row = dataset.filter(lambda x: x["problem_id"] == problem_id, num_proc=1, desc=None) + curr_problem_row = dataset.filter( + lambda x: x["problem_id"] == problem_id, num_proc=1, desc=None + ) ref_arch_src = curr_problem_row["code"][0] problem_name = curr_problem_row["name"][0] - + elif dataset_src == "local": - problem_idx_in_dataset = problem_id - 1 # due to dataset list being 0-indexed locally + problem_idx_in_dataset = ( + problem_id - 1 + ) # due to dataset list being 0-indexed locally ref_arch_path = dataset[problem_idx_in_dataset] problem_name = os.path.basename(ref_arch_path) ref_arch_src = read_file(ref_arch_path) # verify - # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py") + # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py") problem_number = int(problem_name.split("_")[0]) - assert problem_number == problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({problem_id})" - + assert ( + problem_number == problem_id + ), f"Problem number in filename ({problem_number}) does not match config problem_id ({problem_id})" + return ref_arch_src -def fetch_kernel_from_disk(run_dir: str, level: int, problem_id: int, sample_id: int) -> str | None: +def fetch_kernel_from_disk( + run_dir: str, level: int, problem_id: int, sample_id: int +) -> str | None: """ Fetch kernel file from disk (stored in runs/{run_name}) """ - kernel_path = os.path.join(run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py") - + kernel_path = os.path.join( + run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py" + ) + if os.path.exists(kernel_path): return read_file(kernel_path) else: return None -def evaluate_single_sample(work_args: WorkArgs, configs: EvalConfig, dataset, run_dir: str) -> KernelExecResult | None: + +def evaluate_single_sample( + work_args: WorkArgs, configs: EvalConfig, dataset, run_dir: str +) -> KernelExecResult | None: """ Evaluate a single sample on a single GPU """ @@ -144,22 +166,28 @@ def evaluate_single_sample(work_args: WorkArgs, configs: EvalConfig, dataset, ru work_args.device, ) # fetch reference architecture from problem directory - ref_arch_src = fetch_ref_arch_from_problem_id(dataset, problem_id, configs.dataset_src) + ref_arch_src = fetch_ref_arch_from_problem_id( + dataset, problem_id, configs.dataset_src + ) # fetch kernel from disk # Add database support in the future kernel_src = fetch_kernel_from_disk(run_dir, configs.level, problem_id, sample_id) - assert kernel_src is not None, f"Kernel not found for problem {problem_id} sample {sample_id}" + 
assert ( + kernel_src is not None + ), f"Kernel not found for problem {problem_id} sample {sample_id}" - build_dir = os.path.join(configs.kernel_eval_build_dir, configs.run_name, f"{problem_id}", f"{sample_id}") + build_dir = os.path.join( + configs.kernel_eval_build_dir, configs.run_name, f"{problem_id}", f"{sample_id}" + ) - try: + try: eval_result = eval_kernel_against_ref( original_model_src=ref_arch_src, custom_model_src=kernel_src, measure_performance=configs.measure_performance, - verbose=configs.verbose, + verbose=configs.verbose, num_correct_trials=configs.num_correct_trials, num_perf_trials=configs.num_perf_trials, build_dir=build_dir, @@ -175,6 +203,7 @@ def evaluate_single_sample(work_args: WorkArgs, configs: EvalConfig, dataset, ru # NOTE: count this as compilation failure as it is not runnable code metadata = { "cuda_error": f"CUDA Error: {str(e)}", + "cuda_error_name": get_error_name(e), "hardware": torch.cuda.get_device_name(device=device), "device": str(device), } # log this for debugging as this usually signifies illegal memory access @@ -183,14 +212,18 @@ def evaluate_single_sample(work_args: WorkArgs, configs: EvalConfig, dataset, ru ) return eval_result else: - metadata = {"other_error": f"error: {str(e)}", - "hardware": torch.cuda.get_device_name(device=device), - "device": str(device) - } # for debugging - eval_result = KernelExecResult(compiled=False, correctness=False, - metadata=metadata) + metadata = { + "other_error": f"error: {str(e)}", + "other_error_name": get_error_name(e), + "hardware": torch.cuda.get_device_name(device=device), + "device": str(device), + } # for debugging + eval_result = KernelExecResult( + compiled=False, correctness=False, metadata=metadata + ) return eval_result - + + def cuda_single_eval_wrapper(curr_work: WorkArgs, configs: dict, dataset, run_dir: str): """ Wrapper to handle timeout and keyboard interrupt @@ -203,16 +236,18 @@ def cuda_single_eval_wrapper(curr_work: WorkArgs, configs: dict, dataset, run_di args=(curr_work, configs, dataset, run_dir), ).get(timeout=configs.timeout) except KeyboardInterrupt: - print( - "\n [Terminate] Caught KeyboardInterrupt, terminating workers..." - ) + print("\n [Terminate] Caught KeyboardInterrupt, terminating workers...") pool.terminate() pool.join() raise except mp.TimeoutError as e: - print(f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}\nException: {e}") + print( + f"[WARNING] Evaluation TIMED OUT for Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}\nException: {e}" + ) - print(f"[Eval Result] Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}: {result}") + print( + f"[Eval Result] Problem ID: {curr_work.problem_id}, Sample ID: {curr_work.sample_id}: {result}" + ) return result @@ -221,15 +256,20 @@ def remove_cache_dir(cache_dir: str, run_name: str, problem_id, sample_id): Remove the cached folder for sample compilation so it can start a clean build next time useful for time out, failed build, etc. 
""" - problem_cache_dir = os.path.join(cache_dir, run_name, f"{problem_id}", f"{sample_id}") + problem_cache_dir = os.path.join( + cache_dir, run_name, f"{problem_id}", f"{sample_id}" + ) print(f"cache_dir to remove: {problem_cache_dir}") if os.path.exists(cache_dir): try: shutil.rmtree(cache_dir, ignore_errors=True) - print(f"\n[INFO] Removed cached folder for Problem ID: {problem_id}, Sample ID: {sample_id}") + print( + f"\n[INFO] Removed cached folder for Problem ID: {problem_id}, Sample ID: {sample_id}" + ) except Exception as e: print(f"\n[WARNING] Failed to remove cache directory {cache_dir}: {str(e)}") + def batch_eval( total_work: list[tuple[int, int]], config: EvalConfig, @@ -253,7 +293,9 @@ def batch_eval( print( f"[Curr Batch] {len(curr_work_batch)} tasks over {config.num_gpu_devices} GPUs; [Total Work left] {len(total_work)}" ) - assert len(curr_work_batch) <= batch_size, f"Current batch size {len(curr_work_batch)} is greater than the number of GPUs {batch_size}" + assert ( + len(curr_work_batch) <= batch_size + ), f"Current batch size {len(curr_work_batch)} is greater than the number of GPUs {batch_size}" with mp.Pool(batch_size) as pool: @@ -278,7 +320,7 @@ def batch_eval( async_results.append( pool.apply_async(evaluate_single_sample, work_arg) ) - + # Collect results with a batch timeout results = [] batch_timeout = config.timeout @@ -290,20 +332,31 @@ def batch_eval( remaining_time = max(0, batch_timeout - elapsed_time) result = async_result.get(timeout=remaining_time) results.append((problem_id, sample_id, result)) - + except mp.TimeoutError: print( f"[WARNING] Evaluation TIMED OUT for Problem ID: {problem_id}, Sample ID: {sample_id}" ) results.append((problem_id, sample_id, None)) - - remove_cache_dir(config.kernel_eval_build_dir, config.run_name, problem_id, sample_id) + + remove_cache_dir( + config.kernel_eval_build_dir, + config.run_name, + problem_id, + sample_id, + ) except Exception as e: print( f"[ERROR] Evaluation FAILED for Problem ID: {problem_id}, Sample ID: {sample_id}: {str(e)}" ) + traceback.print_exc() results.append((problem_id, sample_id, None)) - remove_cache_dir(config.kernel_eval_build_dir, config.run_name, problem_id, sample_id) + remove_cache_dir( + config.kernel_eval_build_dir, + config.run_name, + problem_id, + sample_id, + ) end_time = time.time() @@ -318,8 +371,12 @@ def batch_eval( # add all the batch results here to avoid file race condition # add to eval result if valid result if result is not None: - print(f"Adding Eval Result to file for problem {problem_id} sample {sample_id}") - add_to_eval_results_file(problem_id, sample_id, result, eval_file_path) + print( + f"Adding Eval Result to file for problem {problem_id} sample {sample_id}" + ) + add_to_eval_results_file( + problem_id, sample_id, result, eval_file_path + ) print("-" * 128) print( @@ -328,51 +385,62 @@ def batch_eval( pbar.update(len(curr_work_batch)) -def check_if_eval_exists_local(problem_id: int, sample_id: int, eval_file_path: str) -> bool: + +def check_if_eval_exists_local( + problem_id: int, sample_id: int, eval_file_path: str +) -> bool: """ Check if evaluation result already exists in eval results file """ if os.path.exists(eval_file_path): - with open(eval_file_path, 'r') as f: + with open(eval_file_path, "r") as f: eval_results = json.load(f) return str(problem_id) in eval_results return False -def add_to_eval_results_file(problem_id: int, sample_id: int, eval_result: KernelExecResult, eval_file_path: str): + +def add_to_eval_results_file( + problem_id: int, sample_id: 
int, eval_result: KernelExecResult, eval_file_path: str +): """ Add evaluation result to eval results file TODO: migrate database support """ # Load existing results if file exists if os.path.exists(eval_file_path): - with open(eval_file_path, 'r') as f: + with open(eval_file_path, "r") as f: eval_results = json.load(f) else: eval_results = {} - + # Add new result eval_results[str(problem_id)] = { # assume 1 sample for now, will think about how to do this better for more samples - 'sample_id': sample_id, - 'compiled': eval_result.compiled, - 'correctness': eval_result.correctness, - 'metadata': check_metadata_serializable_all_types(eval_result.metadata), - 'runtime': eval_result.runtime, - 'runtime_stats': eval_result.runtime_stats, + "sample_id": sample_id, + "compiled": eval_result.compiled, + "correctness": eval_result.correctness, + "metadata": check_metadata_serializable_all_types(eval_result.metadata), + "runtime": eval_result.runtime, + "runtime_stats": eval_result.runtime_stats, } - + # Write updated results back to file if not os.path.exists(eval_file_path): os.makedirs(os.path.dirname(eval_file_path), exist_ok=True) - + with open(eval_file_path, "w") as f: - json.dump(eval_results, f) + json.dump(eval_results, f, indent=4) -def single_eval_example(config: EvalConfig, curr_level_dataset: list[str], run_dir: str, eval_file_path ): + +def single_eval_example( + config: EvalConfig, curr_level_dataset: list[str], run_dir: str, eval_file_path +): device = torch.device("cuda:0") example_work = WorkArgs(problem_id=1, sample_id=0, device=device) # example_eval_result = evaluate_single_sample(example_work, config, curr_level_dataset, run_dir) - example_eval_result = cuda_single_eval_wrapper(example_work, config, curr_level_dataset, run_dir) + example_eval_result = cuda_single_eval_wrapper( + example_work, config, curr_level_dataset, run_dir + ) print(example_eval_result) if not check_if_eval_exists_local(1, 0, eval_file_path): add_to_eval_results_file(1, 0, example_eval_result, eval_file_path) @@ -385,7 +453,7 @@ def main(config: EvalConfig): Store Eval Results in specified eval results file """ print(f"Starting Batch Eval with config: {config}") - + # Check if CUDA is available if not torch.cuda.is_available(): raise RuntimeError("CUDA device not available. 
Evaluation requires GPU.") @@ -399,35 +467,44 @@ def main(config: EvalConfig): curr_level_dataset = dataset[f"level_{config.level}"] elif config.dataset_src == "local": curr_level_dataset = construct_kernelbench_dataset(config.level) - + num_problems_in_level = len(curr_level_dataset) if config.subset == (None, None): problem_id_range = range(1, num_problems_in_level) else: - assert config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level, f"Subset range {config.subset} out of range for Level {config.level}" + assert ( + config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level + ), f"Subset range {config.subset} out of range for Level {config.level}" problem_id_range = range(config.subset[0], config.subset[1]) - print(f"Evaluating 1 sample each for level {config.level} problems: {problem_id_range}") + print( + f"Evaluating 1 sample each for level {config.level} problems: {problem_id_range}" + ) run_dir = os.path.join(config.runs_dir, config.run_name) eval_file_path = os.path.join(run_dir, "eval_results.json") - # set GPU arch to configure what target to build for set_gpu_arch(config.gpu_arch) - assert config.num_gpu_devices <= torch.cuda.device_count(), f"Number of GPUs requested ({config.num_gpu_devices}) is greater than the number of available GPUs ({torch.cuda.device_count()})" + assert ( + config.num_gpu_devices <= torch.cuda.device_count() + ), f"Number of GPUs requested ({config.num_gpu_devices}) is greater than the number of available GPUs ({torch.cuda.device_count()})" # To Debug # single_eval_example(config, curr_level_dataset, run_dir, eval_file_path) total_work = [] - for problem_id in range(problem_id_range.start, problem_id_range.stop + 1): # end index is inclusive - sample_id = 0 # only evaluate 1 sample for now + for problem_id in range( + problem_id_range.start, problem_id_range.stop + 1 + ): # end index is inclusive + sample_id = 0 # only evaluate 1 sample for now if not check_if_eval_exists_local(problem_id, sample_id, eval_file_path): total_work.append((problem_id, sample_id)) - print(f"Start evaluation on {len(total_work)} unevaluated samples in range: {problem_id_range}") + print( + f"Start evaluation on {len(total_work)} unevaluated samples in range: {problem_id_range}" + ) # Build Cache on CPU as that is faster if config.build_cache: compile.batch_compile(total_work, config.to_dict()) @@ -438,4 +515,3 @@ def main(config: EvalConfig): if __name__ == "__main__": main() - \ No newline at end of file diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 15ba4c1a..223cdc39 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -1,16 +1,25 @@ -import pydra -from pydra import REQUIRED, Config +import json import os, sys + +import pydra import torch -import json from datasets import load_dataset +from pydra import Config, REQUIRED from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_triton import prompt_generate_custom_triton_from_prompt_template -from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets +from src.prompt_constructor_triton import ( + prompt_generate_custom_triton_from_prompt_template, +) +from src.utils import ( + create_inference_server_from_presets, + extract_first_code, + query_server, + read_file, + set_gpu_arch, 
+) """ Generate and evaluate a single sample @@ -21,15 +30,15 @@ torch.set_printoptions(precision=4, threshold=10) + class EvalConfig(Config): def __init__(self): - - self.dataset_src = REQUIRED # either huggingface or local + + self.dataset_src = REQUIRED # either huggingface or local # name of dataset name on Hugging Face self.dataset_name = "ScalingIntelligence/KernelBench" - # Problem Specification self.level = REQUIRED # NOTE: this is the logical index (problem id the problem_name)\ @@ -89,24 +98,31 @@ def main(config: EvalConfig): if config.log: os.makedirs(config.logdir, exist_ok=True) - + # Problem Checks num_problems = len(curr_level_dataset) print(f"Number of problems in Level {config.level}: {num_problems}") - print(f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}") - - assert config.problem_id <= num_problems, f"Problem ID {config.problem_id} out of range for Level {config.level}" + print( + f"Start Generation + Evaluation for Level {config.level} Problem {config.problem_id}" + ) + assert ( + config.problem_id <= num_problems + ), f"Problem ID {config.problem_id} out of range for Level {config.level}" # 1. Fetch Problem if config.dataset_src == "huggingface": - curr_problem_row = curr_level_dataset.filter(lambda x: x["problem_id"] == config.problem_id) + curr_problem_row = curr_level_dataset.filter( + lambda x: x["problem_id"] == config.problem_id + ) ref_arch_src = curr_problem_row["code"][0] problem_name = curr_problem_row["name"][0] elif config.dataset_src == "local": - problem_idx_in_dataset = config.problem_id - 1 # due to dataset list being 0-indexed locally + problem_idx_in_dataset = ( + config.problem_id - 1 + ) # due to dataset list being 0-indexed locally ref_arch_path = curr_level_dataset[problem_idx_in_dataset] problem_name = os.path.basename(ref_arch_path) @@ -115,20 +131,21 @@ def main(config: EvalConfig): # Extract problem number from problem name (e.g. "1" from "1_Square_matrix_multiplication_.py") problem_number = int(problem_name.split("_")[0]) - assert problem_number == config.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" - - + assert ( + problem_number == config.problem_id + ), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" + # 2. Generate Sample # Create inference function with config parameters # We provide some presets in utils but you can also pass in your own, see query_server for more details - inference_server = create_inference_server_from_presets(server_type=config.server_type, - model_name=config.model_name, - temperature=config.temperature, - max_tokens=config.max_tokens, - verbose=config.verbose, - time_generation=True) - - + inference_server = create_inference_server_from_presets( + server_type=config.server_type, + model_name=config.model_name, + temperature=config.temperature, + max_tokens=config.max_tokens, + verbose=config.verbose, + time_generation=True, + ) # Use appropriate prompt constructor based on backend if config.backend == "cuda": @@ -136,39 +153,67 @@ def main(config: EvalConfig): elif config.backend == "triton": custom_prompt = prompt_generate_custom_triton_from_prompt_template(ref_arch_src) else: - raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'.") - + raise ValueError( + f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'." 
+ ) + if config.log_prompt: - with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f: + with open( + os.path.join( + config.logdir, + f"prompt_level_{config.level}_problem_{config.problem_id}.txt", + ), + "w", + ) as f: f.write(custom_prompt) # Query server with constructed prompt custom_kernel = inference_server(custom_prompt) custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) # check LLM is able to generate custom kernel code - assert custom_kernel is not None, f"Custom {config.backend} kernel code generation failed" - + assert ( + custom_kernel is not None + ), f"Custom {config.backend} kernel code generation failed" + # this should be optional if config.log: - with open(os.path.join(config.logdir, f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py"), "w") as f: + with open( + os.path.join( + config.logdir, + f"generated_kernel_level_{config.level}_problem_{config.problem_id}.py", + ), + "w", + ) as f: f.write(custom_kernel) # 3. Evaluate Kernel # NOTE: no need to wrap around process here as only a single sample # see batch eval for examples of process isolation kernel_exec_result = eval_kernel_against_ref( - ref_arch_src, custom_kernel, verbose=config.verbose, measure_performance=True, - num_correct_trials=5, num_perf_trials=100, backend=config.backend + ref_arch_src, + custom_kernel, + verbose=config.verbose, + measure_performance=True, + num_correct_trials=5, + num_perf_trials=100, + backend=config.backend, + ) + + print( + f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}" ) - - print(f"Evaluation result for level {config.level} problem {config.problem_id}:\n{kernel_exec_result}") if config.log: - with open(os.path.join(config.logdir, f"eval_result_level_{config.level}_problem_{config.problem_id}.txt"), "a") as f: + with open( + os.path.join( + config.logdir, + f"eval_result_level_{config.level}_problem_{config.problem_id}.txt", + ), + "a", + ) as f: f.write(f"Problem Name: {problem_name}\n") f.write(str(kernel_exec_result)) if __name__ == "__main__": main() - diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 51f1e892..9dd48bf5 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -1,17 +1,26 @@ -import pydra -from pydra import REQUIRED, Config -import os, sys -import torch import json +import os, sys from dataclasses import dataclass +import pydra +import torch from datasets import load_dataset +from pydra import Config, REQUIRED from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.utils import extract_first_code, set_gpu_arch, read_file, create_inference_server_from_presets, maybe_multithread +from src.prompt_constructor_triton import ( + prompt_generate_custom_triton_from_prompt_template, +) +from src.utils import ( + create_inference_server_from_presets, + extract_first_code, + maybe_multithread, + read_file, + set_gpu_arch, +) """ Batch Generate Samples for Particular Level @@ -23,21 +32,25 @@ torch.set_printoptions(precision=4, threshold=10) + class GenerationConfig(Config): def __init__(self): - - self.dataset_src = REQUIRED # either huggingface or local + + self.dataset_src = REQUIRED # either huggingface or local # name of dataset name on Hugging Face self.dataset_name = "ScalingIntelligence/KernelBench" # Problem Specification self.level = 
REQUIRED - + # subset of problems to generate, otherwise generate on all problems in the level - self.subset = (None, None) # (problem_id, problem_name), these are the logical index + self.subset = ( + None, + None, + ) # (problem_id, problem_name), these are the logical index - self.run_name = REQUIRED # name of the run + self.run_name = REQUIRED # name of the run # num of thread pool to call inference server in parallel self.num_workers = 1 @@ -48,13 +61,13 @@ def __init__(self): self.model_name = "deepseek-coder" self.max_tokens = 4096 self.temperature = 0.0 - + # Logging # Top Directory to Store Runs self.runs_dir = os.path.join(REPO_TOP_DIR, "runs") - + self.verbose = False - self.store_type = "local" # TODO: add Database Integration + self.store_type = "local" # TODO: add Database Integration # Future support # Migrate Monkeys code base to KernelBench @@ -70,23 +83,34 @@ def greedy(self): def __repr__(self): return f"EvalConfig({self.to_dict()})" - + @dataclass class WorkArgs: - problem_id: int # logically indexed + problem_id: int # logically indexed sample_id: int -def generate_sample_single(work: WorkArgs, config: GenerationConfig, dataset, inference_server: callable, run_dir: str) -> bool: + +def generate_sample_single( + work: WorkArgs, + config: GenerationConfig, + dataset, + inference_server: callable, + run_dir: str, +) -> bool: # 1. Fetch Problem if config.dataset_src == "huggingface": - curr_problem_row = dataset.filter(lambda x: x["problem_id"] == work.problem_id, desc=None) + curr_problem_row = dataset.filter( + lambda x: x["problem_id"] == work.problem_id, desc=None + ) ref_arch_src = curr_problem_row["code"][0] problem_name = curr_problem_row["name"][0] elif config.dataset_src == "local": - problem_idx_in_dataset = work.problem_id - 1 # due to dataset list being 0-indexed locally + problem_idx_in_dataset = ( + work.problem_id - 1 + ) # due to dataset list being 0-indexed locally ref_arch_path = dataset[problem_idx_in_dataset] problem_name = os.path.basename(ref_arch_path) @@ -94,14 +118,24 @@ def generate_sample_single(work: WorkArgs, config: GenerationConfig, dataset, in # Extract problem number from problem name (e.g. 
"1" from "1_Square_matrix_multiplication_.py") problem_number = int(problem_name.split("_")[0]) - assert problem_number == work.problem_id, f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" - - - - # Construct Prompt - custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) + assert ( + problem_number == work.problem_id + ), f"Problem number in filename ({problem_number}) does not match config problem_id ({config.problem_id})" + + # Construct Prompt + if config.backend == "cuda": + custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template( + ref_arch_src + ) + elif config.backend == "triton": + custom_cuda_prompt = prompt_generate_custom_triton_from_prompt_template( + ref_arch_src + ) if config.log_prompt: - prompt_path = os.path.join(run_dir, f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_prompt.txt") + prompt_path = os.path.join( + run_dir, + f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_prompt.txt", + ) with open(prompt_path, "w") as f: f.write(custom_cuda_prompt) @@ -112,17 +146,28 @@ def generate_sample_single(work: WorkArgs, config: GenerationConfig, dataset, in assert custom_cuda is not None, "Custom CUDA code generation failed" if config.verbose: - print(f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}") + print( + f"Generated sample {work.sample_id} for problem {problem_number}: {problem_name}" + ) # Store to local file - kernel_path = os.path.join(run_dir, f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py") + kernel_path = os.path.join( + run_dir, + f"level_{config.level}_problem_{work.problem_id}_sample_{work.sample_id}_kernel.py", + ) with open(kernel_path, "w") as f: f.write(custom_cuda) - + return True - -def generate_sample_launcher(work: WorkArgs, config: GenerationConfig, dataset, inference_server: callable, run_dir: str): + +def generate_sample_launcher( + work: WorkArgs, + config: GenerationConfig, + dataset, + inference_server: callable, + run_dir: str, +): try: return generate_sample_single(work, config, dataset, inference_server, run_dir) except Exception as e: @@ -130,13 +175,17 @@ def generate_sample_launcher(work: WorkArgs, config: GenerationConfig, dataset, return None -def check_kernel_exists(run_dir: str, level: int, problem_id: int, sample_id: int) -> bool: +def check_kernel_exists( + run_dir: str, level: int, problem_id: int, sample_id: int +) -> bool: """ Check if a kernel for a given problem and sample ID already exists in the run directory """ - kernel_path = os.path.join(run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py") + kernel_path = os.path.join( + run_dir, f"level_{level}_problem_{problem_id}_sample_{sample_id}_kernel.py" + ) return os.path.exists(kernel_path) - + @pydra.main(base=GenerationConfig) def main(config: GenerationConfig): @@ -153,63 +202,69 @@ def main(config: GenerationConfig): elif config.dataset_src == "local": curr_level_dataset = construct_kernelbench_dataset(config.level) - num_problems_in_level = len(curr_level_dataset) if config.subset == (None, None): problem_id_range = range(1, num_problems_in_level) else: - assert config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level, f"Subset range {config.subset} out of range for Level {config.level}" + assert ( + config.subset[0] >= 1 and config.subset[1] <= num_problems_in_level + ), f"Subset range {config.subset} out of range for Level 
{config.level}" problem_id_range = range(config.subset[0], config.subset[1]) - print(f"Generating on 1 sample each for level {config.level} problems: {problem_id_range}") + print( + f"Generating on 1 sample each for level {config.level} problems: {problem_id_range}" + ) # set up run directory run_dir = os.path.join(config.runs_dir, config.run_name) os.makedirs(run_dir, exist_ok=True) pydra.save_yaml(config.to_dict(), os.path.join(run_dir, "generation_config.yaml")) - - assert config.store_type == "local", "supporting local file-system based storage for now" # database integreation coming soon, need to migrate from CUDA Monkeys code + assert ( + config.store_type == "local" + ), "supporting local file-system based storage for now" # database integreation coming soon, need to migrate from CUDA Monkeys code problems_to_run = [] - for problem_id in range(problem_id_range.start, problem_id_range.stop + 1): # end index is inclusive + for problem_id in range( + problem_id_range.start, problem_id_range.stop + 1 + ): # end index is inclusive # assume sample id is 0 for now if not check_kernel_exists(run_dir, config.level, problem_id, sample_id=0): problems_to_run.append( - WorkArgs( - problem_id=int(problem_id), - sample_id=0 # fix to 0 for now - ) - ) - + WorkArgs(problem_id=int(problem_id), sample_id=0) # fix to 0 for now + ) # Create inference function with config parameters # We provide some presets in utils but you can also pass in your own, see query_server for more details - inference_server = create_inference_server_from_presets(server_type=config.server_type, - model_name=config.model_name, - temperature=config.temperature, - max_tokens=config.max_tokens, - verbose=config.verbose) + inference_server = create_inference_server_from_presets( + server_type=config.server_type, + model_name=config.model_name, + temperature=config.temperature, + max_tokens=config.max_tokens, + verbose=config.verbose, + ) # Launch workers - generation_results = maybe_multithread(generate_sample_launcher, - problems_to_run, - config.num_workers, - time_interval=config.api_query_interval, - # extra args - config=config, - dataset=curr_level_dataset, - inference_server=inference_server, - run_dir=run_dir - ) - + generation_results = maybe_multithread( + generate_sample_launcher, + problems_to_run, + config.num_workers, + time_interval=config.api_query_interval, + # extra args + config=config, + dataset=curr_level_dataset, + inference_server=inference_server, + run_dir=run_dir, + ) + num_generated_samples = len(generation_results) total_problems = len(problems_to_run) num_failed_problems = total_problems - num_generated_samples - print(f"Generated {num_generated_samples} samples for total {total_problems} problems, Please retry for the {num_failed_problems} failed problems.") + print( + f"Generated {num_generated_samples} samples for total {total_problems} problems, Please retry for the {num_failed_problems} failed problems." 
+ ) if __name__ == "__main__": main() - diff --git a/src/eval.py b/src/eval.py index 8902c488..345495a5 100644 --- a/src/eval.py +++ b/src/eval.py @@ -29,6 +29,11 @@ KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") +def get_error_name(e: Exception) -> str: + + return f"{e.__class__.__module__}.{e.__class__.__name__}" + + def fetch_kernel_from_database( run_name: str, problem_id: int, sample_id: int, server_url: str ): @@ -118,7 +123,7 @@ def load_original_model_and_inputs( def load_custom_model_with_tempfile( - code_string, build_directory=None, entry_point="ModelNew" + model_custom_src, build_directory=None, entry_point="ModelNew" ): """ Writes the provided Python code string to a temporary .py file, @@ -139,7 +144,7 @@ def load_custom_model_with_tempfile( # Create a temporary named file with a .py extension with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tmp_file: # Write the code string into the file - tmp_file.write(code_string) + tmp_file.write(model_custom_src) # Capture the path to the file tempfile_path = tmp_file.name temp_file = tmp_file @@ -377,6 +382,10 @@ def eval_kernel_against_ref( # set CUDA device torch.cuda.set_device(device) is_triton = backend == "triton" + metadata = {} # for storing result metadata + metadata["hardware"] = torch.cuda.get_device_name(device=device) + metadata["device"] = str(device) # for debugging + if is_triton: # need to set env var for triton code to guarentee no wrong device shennanignas if isinstance(device, int): @@ -415,10 +424,6 @@ def eval_kernel_against_ref( if verbose: print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") - metadata = {} # for storing result metadata - metadata["hardware"] = torch.cuda.get_device_name(device=device) - metadata["device"] = str(device) # for debugging - # this is where compilation happens try: os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion @@ -446,6 +451,7 @@ def eval_kernel_against_ref( graceful_eval_cleanup(context, device, tempfile) return None else: + metadata["compilation_error_name"] = get_error_name(e) metadata["compilation_error"] = e graceful_eval_cleanup(context, device, tempfile) return KernelExecResult( @@ -468,6 +474,7 @@ def eval_kernel_against_ref( # TODO: add metadata for runtime error e.g. error in launching kernel, illegal memory access, ... graceful_eval_cleanup(context, device, tempfile) metadata["runtime_error"] = e + metadata["runtime_error_name"] = get_error_name(e) return KernelExecResult( compiled=True, correctness=False, metadata=metadata ) # skip further steps @@ -487,11 +494,11 @@ def eval_kernel_against_ref( verbose=verbose, seed=seed_num, device=device, - truncate_errors=not is_triton, ) except Exception as e: # TODO: add metadata for runtime error e.g. error in launching kernel, illegal memory access, ... 
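Aside: the get_error_name helper added to src/eval.py above simply joins the exception's defining module and class name; those strings are what the new cuda_error_name / compilation_error_name / runtime_error_name metadata fields carry. A minimal illustration (not part of the patch), assuming the helper is importable from src.eval as modified here:

    from src.eval import get_error_name

    print(get_error_name(ValueError("bad input")))   # builtins.ValueError
    print(get_error_name(RuntimeError("boom")))      # builtins.RuntimeError
    # Exceptions defined inside a package report that package's dotted module path.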
metadata["runtime_error"] = e + metadata["runtime_error_name"] = get_error_name(e) kernel_exec_result = KernelExecResult( compiled=True, correctness=False, metadata=metadata ) @@ -627,7 +634,6 @@ def run_and_check_correctness( verbose=False, seed=42, device=None, - truncate_errors: bool = True, ) -> KernelExecResult: """ run the model and check correctness, @@ -678,6 +684,7 @@ def run_and_check_correctness( f"Output shape mismatch: Expected {output.shape}, got {output_new.shape}", metadata, ) + metadata["correctness_issue_name"] = "correctness_issue" if verbose: print( f"[FAIL] trial {trial}: Output shape mismatch: Expected {output.shape}, got {output_new.shape}" @@ -707,8 +714,9 @@ def run_and_check_correctness( print(f"Error in launching kernel for ModelNew: {e}") metadata = register_and_format_exception( - "runtime_error", e, metadata, truncate=truncate_errors + "runtime_error", e, metadata, truncate=True ) + metadata["runtime_error_name"] = get_error_name(e) return KernelExecResult( compiled=True, correctness=False, metadata=metadata ) diff --git a/src/prompt_constructor_triton.py b/src/prompt_constructor_triton.py index 807cceb8..e3ea945b 100644 --- a/src/prompt_constructor_triton.py +++ b/src/prompt_constructor_triton.py @@ -1,4 +1,5 @@ import os + from .utils import read_file @@ -44,12 +45,15 @@ def get_arch_definition(arch_src): Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n """ + def prompt_generate_custom_triton( arc_src: str, example_arch_src: str, example_new_arch_src: str ) -> str: prompt = PROBLEM_STATEMENT - assert "@triton.jit" in example_new_arch_src, "Example new arch must contain Triton kernel" + assert ( + "@triton.jit" in example_new_arch_src + ), "Example new arch must contain Triton kernel" if example_arch_src != "" and example_new_arch_src != "": prompt += f""" @@ -79,210 +83,15 @@ def prompt_generate_custom_triton( Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n """ -def prompt_generate_custom_triton_fewshot_and_template(ref_arch_src: str, shots: list) -> str: - """ - Generate a prompt with specified few-shot examples following a template - - shots: list of few-shot examples to include in the prompt - Avaliable few shot options to start with: - - ex_add: pointwise addition - - ex_fuse_gelu: fused gelu - - ex_mnist2: fused convolutions and relus (DEPRECATED) - - ex_tiled_matmul: tiled matrix multiplication - - ex_flash_attn: simple flash attention - """ - prompt = PROBLEM_STATEMENT_CLEANED - - # k = 1 - example_add = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_add.py") - ) - example_add_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_add.py") - ) - example_add_desc = "This given architecture is for a pointwise addition: " - - # k = 2 - example_fuse_gelu = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_fuse_gelu.py") - ) - example_fuse_gelu_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_fuse_gelu.py") - ) - example_fuse_gelu_desc = "This given architecture is for a fused gelu: " - - # k = 3 (DEPRECATED) - example_mnist2 = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py") - ) - example_mnist2_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py") - ) - exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: " - - # k = 4 - example_tiled_matmul = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_tiled_matmul.py") - ) - example_tiled_matmul_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_tiled_matmul.py") - ) - example_tiled_matmul_desc = "This given architecture is for a model with tiled matrix multiplication: " - - # k = 5 - example_flash_attn = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_flash_attn.py") - ) - example_flash_attn_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_flash_attn.py") - ) - example_flash_attn_desc = "This given architecture is for a model with simple io-aware implementation of attention, also known as flash attention: " - - examples = [] - for s in shots: - if s not in ["ex_add", "ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul", "ex_flash_attn"]: - raise ValueError(f"Invalid shot: {s}") - elif s == "ex_add": - examples.append((example_add, example_add_new, example_add_desc)) - elif s == "ex_fuse_gelu": - examples.append((example_fuse_gelu, example_fuse_gelu_new, example_fuse_gelu_desc)) - elif s == "ex_mnist2": # DEPRECATED - raise ValueError("ex_mnist2 is deprecated") - examples.append((example_mnist2, example_mnist2_new, exmaple_mnist2_desc)) - elif s == "ex_tiled_matmul": - examples.append((example_tiled_matmul, example_tiled_matmul_new, example_tiled_matmul_desc)) - elif s == "ex_flash_attn": - examples.append((example_flash_attn, example_flash_attn_new, example_flash_attn_desc)) - - - for i, tup in enumerate(examples): - base, kernel, desc = tup - prompt += f""" -Example {i+1}:\n\n -Here is an example architecture:\n\n -``` -{base} -```\n -{PROBLEM_INSTRUCTION_CLEANED} \n -Here is an optimized verison with custom CUDA kernels: \n -``` -{kernel} -```\n\n -""" +def prompt_generate_custom_triton_fewshot_and_template( + ref_arch_src: str, shots: list +) -> str: + raise NotImplementedError("This function has not been implemented yet") -# should we put task here? 
- prompt += f""" -Task:\n\n -Here is an example architecture:\n\n -``` -{ref_arch_src} -```\n -""" - prompt += PROBLEM_INSTRUCTION_CLEANED - return prompt def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) -> str: - """ - Generate a prompt with a CoT example following a template - Avaliable CoT examples: - - ex_fuse_gelu: fused gelu - - ex_mnist2: fused convolutions and relus - - ex_tiled_matmul: tiled matrix multiplication - """ - - # I updated this to allow CoT. Also explicilty state think step by step. - PROBLEM_INSTRUCTION_COT = """ -Optimize the architecture named Model with custom CUDA operators! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Do not output testing code. -In the end, make sure the final code block contains code for output architecture ModelNew with cuda code.\n -Let's think step by step.\n -""" - - prompt = PROBLEM_STATEMENT_CLEANED - - assert cot_example in ["ex_fuse_gelu", "ex_mnist2", "ex_tiled_matmul"] - - # k = 2 - example_fuse_gelu = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_fuse_gelu.py") - ) - example_fuse_gelu_cot = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_fuse_gelu.py") - ) - example_fuse_gelu_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_fuse_gelu.py") - ) - example_fuse_gelu_desc = "This given architecture is for a fused gelu: " - - # k = 3 - example_mnist2 = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_mnist2.py") - ) - example_mnist2_cot = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_mnist2.py") - ) - example_mnist2_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_mnist2.py") - ) - exmaple_mnist2_desc = "This given architecture is for a model with fused convolutions and relus: " - - # k = 4 - example_tiled_matmul = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_ex_tiled_matmul.py") - ) - example_tiled_matmul_cot = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/cot/model_cot_tiled_matmul.py") - ) - example_tiled_matmul_new = read_file( - os.path.join(REPO_TOP_PATH, "src/prompts/few_shot/model_new_ex_tiled_matmul.py") - ) - example_tiled_matmul_desc = "This given architecture is for a model with tiled matrix multiplication: " - - match cot_example: - case "ex_fuse_gelu": - base = example_fuse_gelu - cot = example_fuse_gelu_cot - kernel = example_fuse_gelu_new - desc = example_fuse_gelu_desc - case "ex_mnist2": - base = example_mnist2 - cot = example_mnist2_cot - kernel = example_mnist2_new - desc = exmaple_mnist2_desc - case "ex_tiled_matmul": - base = example_tiled_matmul - cot = example_tiled_matmul_cot - kernel = example_tiled_matmul_new - desc = example_tiled_matmul_desc - case _: - raise ValueError(f"Invalid CoT example: {cot_example} not found in CoT examples") - - # construct example with - # NOTE: we only do one example with CoT for now - # 1. ref_src problem -> 2. Instruction -> 3. CoT -> 4. 
Solution - prompt += f""" -Here is an example architecture:\n\n -``` -{base} -```\n -{PROBLEM_INSTRUCTION_COT} \n -{cot} \n -``` -{kernel} -```\n\n -""" - -# show task to solve - prompt += f""" -Task:\n\n -Here is an example architecture:\n\n -``` -{ref_arch_src} -```\n -""" - prompt += PROBLEM_INSTRUCTION_COT - - return prompt + raise NotImplementedError("This function has not been implemented yet") def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str: @@ -294,9 +103,7 @@ def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str # These are strictly defined for now # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) - example_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_ex_add.py" - ) + example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") example_new_arch_path = os.path.join( REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" ) @@ -316,9 +123,11 @@ def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str return prompt_generate_custom_triton(arch, example_arch, example_new_arch) -def prompt_generate_prompt_with_hardware_info_from_template(ref_arch_src: str, gpu_name: str) -> str: +def prompt_generate_prompt_with_hardware_info_from_template( + ref_arch_src: str, gpu_name: str +) -> str: """ - Similar to prompt_generate_custom_triton_from_prompt_template, + Similar to prompt_generate_custom_triton_from_prompt_template, but with hardware information for the given GPU """ @@ -326,34 +135,35 @@ def prompt_generate_prompt_with_hardware_info_from_template(ref_arch_src: str, g # These are strictly defined for now # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) - example_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_ex_add.py" - ) + example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") example_new_arch_path = os.path.join( REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" ) - gpu_spec_file_path = os.path.join(REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py") + gpu_spec_file_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py" + ) example_arch = read_file(example_arch_path) example_new_arch = read_file(example_new_arch_path) gpu_spec_info = read_file(gpu_spec_file_path) return prompt_generate_prompt_with_hardware_info( - ref_arch_src=arch, - gpu_name=gpu_name, - example_arch_src=example_arch, - example_new_arch_src=example_new_arch, - gpu_spec_info_src=gpu_spec_info - ) - + ref_arch_src=arch, + gpu_name=gpu_name, + example_arch_src=example_arch, + example_new_arch_src=example_new_arch, + gpu_spec_info_src=gpu_spec_info, + ) -def prompt_generate_prompt_with_hardware_info(ref_arch_src: str, - gpu_name: str, - example_arch_src: str, - example_new_arch_src: str, - gpu_spec_info_src: str) -> str: +def prompt_generate_prompt_with_hardware_info( + ref_arch_src: str, + gpu_name: str, + example_arch_src: str, + example_new_arch_src: str, + gpu_spec_info_src: str, +) -> str: """ Generate a prompt with hardware information for the given GPU gpu_spec_info_src: str of the gpu spec src file @@ -361,17 +171,19 @@ def prompt_generate_prompt_with_hardware_info(ref_arch_src: str, # Create a dictionary to store the local namespace local_dict = {} - + # Execute the GPU spec file in the local namespace exec(gpu_spec_info_src, {}, local_dict) - + # Get the required variables from the local 
namespace - GPU_SPEC_INFO = local_dict.get('GPU_SPEC_INFO') - GPU_DEFINITIONS = local_dict.get('GPU_DEFINITIONS') - GPU_BEST_PRACTICES = local_dict.get('GPU_BEST_PRACTICES') - + GPU_SPEC_INFO = local_dict.get("GPU_SPEC_INFO") + GPU_DEFINITIONS = local_dict.get("GPU_DEFINITIONS") + GPU_BEST_PRACTICES = local_dict.get("GPU_BEST_PRACTICES") + if not GPU_SPEC_INFO or not GPU_DEFINITIONS or not GPU_BEST_PRACTICES: - raise ValueError("GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src") + raise ValueError( + "GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src" + ) assert gpu_name in GPU_SPEC_INFO, f"GPU name {gpu_name} not found in GPU_SPEC_INFO" @@ -388,20 +200,19 @@ def prompt_generate_prompt_with_hardware_info(ref_arch_src: str, {example_new_arch_src} ``` \n """ - + curr_gpu_spec_info = GPU_SPEC_INFO[gpu_name] gpu_architecture = curr_gpu_spec_info.get("GPU Architecture") prompt += f""" Here is some information about the underlying hardware that you should keep in mind. \n\n The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.\n\n""" - + for key, value in curr_gpu_spec_info.items(): if key == "GPU Architecture": continue prompt += f"""- We have {value} of {key}.\n""" - - + prompt += f"""\n\n Here are some concepts about the GPU architecture that could be helpful: \n\n""" for key, value in GPU_DEFINITIONS.items(): @@ -412,25 +223,19 @@ def prompt_generate_prompt_with_hardware_info(ref_arch_src: str, for best_practice in GPU_BEST_PRACTICES: prompt += f"""- {best_practice}\n""" - prompt += f""" You are given the following architecture: \n ``` {ref_arch_src} ``` """ - prompt += PROBLEM_INSTRUCTION return prompt - return None - - - def prompt_fix_compile(ref_arch_src, custom_cuda, metadata): prompt = PROBLEM_STATEMENT prompt += f""" @@ -471,13 +276,15 @@ def prompt_fix_correctness(ref_arch_src, custom_cuda, metadata): """ return prompt + def main(): gpu_name = "L40S" - ref_arch_src = read_file(os.path.join(KERNEL_BENCH_PATH, f"level1/19_ReLU.py")) assert len(ref_arch_src) > 0, "ref_arch_src is empty" - prompt = prompt_generate_prompt_with_hardware_info_from_template(ref_arch_src, gpu_name) + prompt = prompt_generate_prompt_with_hardware_info_from_template( + ref_arch_src, gpu_name + ) print(prompt) # Write prompt to temp file temp_file_path = os.path.join(REPO_TOP_PATH, "scratch", "prompt_draft.txt") @@ -485,5 +292,6 @@ def main(): with open(temp_file_path, "w") as f: f.write(prompt) + if __name__ == "__main__": main() From eb4e8aaceb3d6466f5fce8bdb2d5623f9a4280d0 Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 18 Mar 2025 15:14:05 -0700 Subject: [PATCH 04/15] revert eval --- scripts/eval_from_generations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 4f9e560d..4898ed7a 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -82,14 +82,14 @@ def __init__(self): # To speedup evaluation, you can start building the kernel on CPU on disk as cache self.build_cache = False self.num_cpu_workers = ( - 96 # number of parallel process to to parallelize the build on CPUs + 20 # number of parallel process to to parallelize the build on CPUs ) # Directory to build kernels for evaluation self.kernel_eval_build_dir = os.path.join(REPO_TOP_DIR, "cache") # number of GPUs to do batch evaluation - self.num_gpu_devices = 8 + self.num_gpu_devices = 1 # Backend to use for kernel 
implementation (cuda or triton) self.backend = "cuda" From e9bf734c08a4fb36b7f067ad6e7e369d63feecec Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 18 Mar 2025 15:16:44 -0700 Subject: [PATCH 05/15] remove traceback --- scripts/eval_from_generations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 4898ed7a..25bf3e80 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -3,7 +3,6 @@ import os import shutil import time -import traceback from dataclasses import dataclass import pydra @@ -349,7 +348,6 @@ def batch_eval( print( f"[ERROR] Evaluation FAILED for Problem ID: {problem_id}, Sample ID: {sample_id}: {str(e)}" ) - traceback.print_exc() results.append((problem_id, sample_id, None)) remove_cache_dir( config.kernel_eval_build_dir, From 26d4cc0d07d48cdafa862fa3c5eb6a725ed09d39 Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 25 Mar 2025 09:36:10 -0700 Subject: [PATCH 06/15] remove cot --- .../triton/model_new_ex_add_triton.py | 63 ------------------- 1 file changed, 63 deletions(-) delete mode 100644 src/prompts/few_shot/triton/model_new_ex_add_triton.py diff --git a/src/prompts/few_shot/triton/model_new_ex_add_triton.py b/src/prompts/few_shot/triton/model_new_ex_add_triton.py deleted file mode 100644 index 43a3f712..00000000 --- a/src/prompts/few_shot/triton/model_new_ex_add_triton.py +++ /dev/null @@ -1,63 +0,0 @@ -import torch -import torch.nn as nn -import torch.nn.functional as F -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, # Pointer to first input - y_ptr, # Pointer to second input - out_ptr, # Pointer to output - n_elements, # Total number of elements in input/output - BLOCK_SIZE: tl.constexpr, -): - # Each program handles a contiguous block of data of size BLOCK_SIZE - block_start = tl.program_id(0) * BLOCK_SIZE - # Create a range of offsets [0..BLOCK_SIZE-1] - offsets = block_start + tl.arange(0, BLOCK_SIZE) - # Mask to ensure we don't go out of bounds - mask = offsets < n_elements - # Load input values - x = tl.load(x_ptr + offsets, mask=mask, other=0.0) - y = tl.load(y_ptr + offsets, mask=mask, other=0.0) - # Perform the elementwise addition - out = x + y - # Store the result - tl.store(out_ptr + offsets, out, mask=mask) - - -def triton_add(x: torch.Tensor, y: torch.Tensor): - """ - This function wraps the Triton kernel call. It: - 1. Ensures the inputs are contiguous on GPU. - 2. Calculates the grid (blocks) needed. - 3. Launches the Triton kernel. - """ - assert x.is_cuda and y.is_cuda, "Tensors must be on CUDA." 
- x = x.contiguous() - y = y.contiguous() - - # Prepare output tensor - out = torch.empty_like(x) - - # Number of elements in the tensor - n_elements = x.numel() - BLOCK_SIZE = 128 # Tunable parameter for block size - - # Determine the number of blocks needed - grid = lambda meta: ((n_elements + meta["BLOCK_SIZE"] - 1) // meta["BLOCK_SIZE"],) - - # Launch the Triton kernel - add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=BLOCK_SIZE) - return out - - -class ModelNew(nn.Module): - def __init__(self) -> None: - super().__init__() - - def forward(self, a, b): - # Instead of "return a + b", call our Triton-based addition - return triton_add(a, b) \ No newline at end of file From e5ba9b3d1d12f5cca88978c3429eefaeb52899df Mon Sep 17 00:00:00 2001 From: PaliC Date: Tue, 25 Mar 2025 11:06:51 -0700 Subject: [PATCH 07/15] improve eval --- src/eval.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/eval.py b/src/eval.py index 345495a5..370786d4 100644 --- a/src/eval.py +++ b/src/eval.py @@ -122,9 +122,7 @@ def load_original_model_and_inputs( return (Model, get_init_inputs_fn, get_inputs_fn) -def load_custom_model_with_tempfile( - model_custom_src, build_directory=None, entry_point="ModelNew" -): +def load_custom_model_with_tempfile(model_custom_src, entry_point="ModelNew"): """ Writes the provided Python code string to a temporary .py file, dynamically imports the module so we can access the modified model class. @@ -136,11 +134,6 @@ def load_custom_model_with_tempfile( with the @triton.jit decorator. """ - if build_directory: - model_custom_src = ( - "import os\n" f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_directory}'\n" - ) + model_custom_src - # Create a temporary named file with a .py extension with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as tmp_file: # Write the code string into the file @@ -431,7 +424,7 @@ def eval_kernel_against_ref( # add hash for later to distinguish between multi-turn kernels if is_triton: ModelNew, tempfile = load_custom_model_with_tempfile( - custom_model_src, build_dir + custom_model_src, entry_point="ModelNew" ) else: ModelNew = load_custom_model(custom_model_src, context, build_dir) From 01e0c0e2cb883168fe44517f838bc5014400ebeb Mon Sep 17 00:00:00 2001 From: AffectionateCurry <160357316+AffectionateCurry@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:13:52 -0700 Subject: [PATCH 08/15] looked over pr and added future support for other languages --- scripts/eval_from_generations.py | 2 +- scripts/generate_and_eval_single_sample.py | 21 +- .../generate_and_eval_single_sample_modal.py | 15 +- scripts/generate_samples.py | 14 +- src/eval.py | 187 +++++- src/prompt_constructor_multilang.py | 553 ++++++++++++++++++ src/prompt_constructor_triton.py | 297 ---------- 7 files changed, 747 insertions(+), 342 deletions(-) create mode 100644 src/prompt_constructor_multilang.py delete mode 100644 src/prompt_constructor_triton.py diff --git a/scripts/eval_from_generations.py b/scripts/eval_from_generations.py index 25bf3e80..84d71ddd 100644 --- a/scripts/eval_from_generations.py +++ b/scripts/eval_from_generations.py @@ -512,4 +512,4 @@ def main(config: EvalConfig): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/scripts/generate_and_eval_single_sample.py b/scripts/generate_and_eval_single_sample.py index 223cdc39..ff71e4bc 100644 --- a/scripts/generate_and_eval_single_sample.py +++ b/scripts/generate_and_eval_single_sample.py @@ -1,18 +1,16 @@ -import json -import os, sys - 
import pydra +from pydra import REQUIRED, Config +import os, sys import torch +import json +import modal from datasets import load_dataset -from pydra import Config, REQUIRED from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_triton import ( - prompt_generate_custom_triton_from_prompt_template, -) +from src.prompt_constructor_multilang import get_prompt_for_backend from src.utils import ( create_inference_server_from_presets, extract_first_code, @@ -150,11 +148,11 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend if config.backend == "cuda": custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) - elif config.backend == "triton": - custom_prompt = prompt_generate_custom_triton_from_prompt_template(ref_arch_src) + elif config.backend in ["triton", "cute"]: # removed "tilelang" + custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) else: raise ValueError( - f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'." + f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." ) if config.log_prompt: @@ -170,6 +168,7 @@ def main(config: EvalConfig): # Query server with constructed prompt custom_kernel = inference_server(custom_prompt) custom_kernel = extract_first_code(custom_kernel, ["python", "cpp"]) + # check LLM is able to generate custom kernel code assert ( custom_kernel is not None @@ -216,4 +215,4 @@ def main(config: EvalConfig): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index d235b460..c2a38c2e 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -10,7 +10,7 @@ #from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_triton import prompt_generate_custom_triton_from_prompt_template +from src.prompt_constructor_multilang import get_prompt_for_backend from src.utils import extract_first_code, query_server, set_gpu_arch, read_file, create_inference_server_from_presets app = modal.App("eval_single_sample") @@ -102,7 +102,14 @@ def __repr__(self): "pytest", "ninja", "utils", + # "tilelang", # commented out - not working currently + #"apache-tvm", + "python-dotenv", + "nvidia-cutlass-dsl", + ) + #.add_local_dir(os.path.join(REPO_TOP_DIR, "KernelBench"), remote_path="/root/KernelBench") + .add_local_dir(os.path.join(REPO_TOP_DIR, "src"), remote_path="/root/src") ) @app.cls(image=image) @@ -182,10 +189,10 @@ def main(config: EvalConfig): # Use appropriate prompt constructor based on backend if config.backend == "cuda": custom_prompt = prompt_generate_custom_cuda_from_prompt_template(ref_arch_src) - elif config.backend == "triton": - custom_prompt = prompt_generate_custom_triton_from_prompt_template(ref_arch_src) + elif config.backend in ["triton", "cute"]: # removed "tilelang" + custom_prompt = get_prompt_for_backend(ref_arch_src, config.backend) else: - raise ValueError(f"Unsupported backend: {config.backend}. Must be 'cuda' or 'triton'.") + raise ValueError(f"Unsupported backend: {config.backend}. 
Must be 'cuda', 'triton', or 'cute'.") if config.log_prompt: with open(os.path.join(config.logdir, f"prompt_level_{config.level}_problem_{config.problem_id}.txt"), "w") as f: diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 9dd48bf5..ed8da502 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -11,9 +11,7 @@ from src.dataset import construct_kernelbench_dataset from src.eval import eval_kernel_against_ref from src.prompt_constructor import prompt_generate_custom_cuda_from_prompt_template -from src.prompt_constructor_triton import ( - prompt_generate_custom_triton_from_prompt_template, -) +from src.prompt_constructor_multilang import get_prompt_for_backend from src.utils import ( create_inference_server_from_presets, extract_first_code, @@ -127,9 +125,11 @@ def generate_sample_single( custom_cuda_prompt = prompt_generate_custom_cuda_from_prompt_template( ref_arch_src ) - elif config.backend == "triton": - custom_cuda_prompt = prompt_generate_custom_triton_from_prompt_template( - ref_arch_src + elif config.backend in ["triton", "cute"]: # removed "tilelang" + custom_cuda_prompt = get_prompt_for_backend(ref_arch_src, config.backend) + else: + raise ValueError( + f"Unsupported backend: {config.backend}. Must be 'cuda', 'triton', or 'cute'." ) if config.log_prompt: prompt_path = os.path.join( @@ -267,4 +267,4 @@ def main(config: GenerationConfig): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/src/eval.py b/src/eval.py index 370786d4..e2411f3c 100644 --- a/src/eval.py +++ b/src/eval.py @@ -2,12 +2,15 @@ Helpers for Evaluations """ +import hashlib import importlib import json +import linecache import os, subprocess import random import sys import tempfile +import traceback from contextlib import redirect_stderr, redirect_stdout from io import StringIO from typing import Union @@ -155,6 +158,41 @@ def load_custom_model_with_tempfile(model_custom_src, entry_point="ModelNew"): return ModelNew, temp_file +# def load_tilelang_model( +# model_custom_src: str, +# context: dict, +# build_directory: str | None = None +# ): +# """ +# Load TileLang model using linecache instead of tempfile. +# This registers the source code in memory so inspect.getsource() works, +# which is needed for TileLang's JIT decorator. +# """ +# if build_directory: +# model_custom_src = ( +# "import os\n" +# f"os.environ['TORCH_EXTENSIONS_DIR'] = '{build_directory}'\n" +# + model_custom_src +# ) +# +# # Register source so inspect.getsource works +# fake_fname = ( +# f"/tmp/tilelang_kernel_" +# f"{hashlib.md5(model_custom_src.encode()).hexdigest()}.py" +# ) +# # linecache expects a list with trailing newlines +# linecache.cache[fake_fname] = ( +# len(model_custom_src), +# None, +# model_custom_src.splitlines(True), +# fake_fname, +# ) +# +# code_obj = compile(model_custom_src, fake_fname, "exec") +# exec(code_obj, context) +# return context["ModelNew"] + + def load_custom_model( model_custom_src: str, context: dict, build_directory: str = None ) -> nn.Module: @@ -341,6 +379,33 @@ def build_compile_cache_with_capturing( return returncode, stdout.decode("utf-8"), stderr.decode("utf-8") +def _process_input_tensor(tensor, device, backend): + """ + Helper function to move tensors to the correct device and apply backend-specific dtype casting. 
+ + Args: + tensor: Input tensor or non-tensor value + device: Target CUDA device + backend: Backend type (e.g., 'cuda', 'triton', 'cute') + + Returns: + Processed tensor on correct device with correct dtype, or original value if not a tensor + """ + if not isinstance(tensor, torch.Tensor): + return tensor + + # Preserve integer dtypes for labels/targets (e.g., classification losses) + if tensor.dtype in [torch.int32, torch.int64, torch.long]: + return tensor.to(device=device) + + # Apply backend-specific dtype casting for float tensors + # if backend.lower() == "tilelang": + # return tensor.to(device=device, dtype=torch.float16) + + # Default for all other backends and float types + return tensor.to(device=device) + + def eval_kernel_against_ref( original_model_src: str, custom_model_src: str, @@ -353,7 +418,7 @@ def eval_kernel_against_ref( device: Union[torch.device, int] = ( torch.cuda.current_device() if torch.cuda.is_available() else None ), # have to run on GPU - backend: str = "cuda", # can be 'cuda' or 'triton' + backend: str = "cuda", # can be 'cuda', 'triton', or 'cute' ) -> KernelExecResult: """ Evaluate the custom kernel against the original model @@ -361,10 +426,15 @@ def eval_kernel_against_ref( num_correct_trials: number of trials to initialize different random inputs; correctness pass only if all trials pass num_perf_trials: run the evalutation many times to take the average device: GPU (cuda) device to run the evalutation on - backend: str, either 'cuda' or 'triton', determines which backend implementation to use + backend: str, one of 'cuda', 'triton', or 'cute' """ # TODO: check device is busy assert torch.cuda.is_available(), "CUDA is not available, cannot run Eval" + + # SET DEFAULT DTYPE TO FLOAT16 ONLY FOR TILELANG + # if backend.lower() == "tilelang": + # torch.set_default_dtype(torch.float16) + torch.set_printoptions( precision=4, # Decimal places threshold=10, # Total number of elements before truncating @@ -374,13 +444,16 @@ def eval_kernel_against_ref( # set CUDA device torch.cuda.set_device(device) - is_triton = backend == "triton" + + # Backends that use tempfile approach and need CUDA_VISIBLE_DEVICES + uses_tempfile = backend.lower() in ["triton", "cute"] # removed "tilelang" + metadata = {} # for storing result metadata metadata["hardware"] = torch.cuda.get_device_name(device=device) metadata["device"] = str(device) # for debugging - if is_triton: - # need to set env var for triton code to guarentee no wrong device shennanignas + if uses_tempfile: + # need to set env var for triton/cute code to guarantee no wrong device shenanigans if isinstance(device, int): device_num = device elif isinstance(device, torch.device): @@ -404,16 +477,17 @@ def eval_kernel_against_ref( ) set_seed(seed_num) # set seed for reproducible input init_inputs = get_init_inputs() - init_inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x for x in init_inputs - ] - + + # Convert inputs to appropriate dtypes for GPU computation + init_inputs = [_process_input_tensor(x, device, backend) for x in init_inputs] + with torch.no_grad(): set_seed(seed_num) # set seed for reproducible weights original_model = Model(*init_inputs) assert hasattr(original_model, "forward") if verbose: print("[Eval] Original Model Loaded") + if verbose: print("[Eval] Loading and Compiling New Model with Custom CUDA Kernel") @@ -422,11 +496,18 @@ def eval_kernel_against_ref( os.environ["TORCH_USE_CUDA_DSA"] = "1" # compile with device side assertion tempfile = None # add hash for later to 
distinguish between multi-turn kernels - if is_triton: + + backend_lower = backend.lower() + # if backend_lower == "tilelang": + # # Use linecache approach for TileLang + # ModelNew = load_tilelang_model(custom_model_src, context, build_dir) + if backend_lower in ["triton", "cute"]: + # Use tempfile approach for triton and cute ModelNew, tempfile = load_custom_model_with_tempfile( custom_model_src, entry_point="ModelNew" ) else: + # Default CUDA backend ModelNew = load_custom_model(custom_model_src, context, build_dir) torch.cuda.synchronize(device=device) # not sure if this is too much except Exception as e: @@ -457,6 +538,27 @@ def eval_kernel_against_ref( set_seed(seed_num) # set seed for reproducible weights custom_model = ModelNew(*init_inputs) assert hasattr(custom_model, "forward") + # Move models to GPU with float16 dtype (only for TileLang) + # if backend.lower() == "tilelang": + # try: + # original_model = original_model.to(device=device, dtype=torch.float16) + # except Exception as e: + # # TileLang JIT kernels may not support .to(), already on GPU + # if verbose: + # print(f"[Info] Could not call .to() on original model (TileLang), using as-is: {e}") + # print("[Traceback]:") + # traceback.print_exc() + # try: + # custom_model = custom_model.to(device=device, dtype=torch.float16) + # except Exception as e: + # # TileLang JIT kernels may not support .to(), already on GPU + # if verbose: + # print(f"[Info] Could not call .to() on custom model (TileLang), using as-is: {e}") + # print("[Traceback]:") + # traceback.print_exc() + # else: + original_model = original_model.to(device=device) + custom_model = custom_model.to(device=device) torch.cuda.synchronize(device=device) if verbose: print("[Eval] New Model with Custom CUDA Kernel Loaded") @@ -487,6 +589,7 @@ def eval_kernel_against_ref( verbose=verbose, seed=seed_num, device=device, + backend=backend, ) except Exception as e: # TODO: add metadata for runtime error e.g. error in launching kernel, illegal memory access, ... 
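A minimal usage sketch of the backend-aware entry point, assuming `ref_src` holds a KernelBench reference `Model` source and `triton_src` holds a generated `ModelNew` that embeds `@triton.jit` kernels (both variable names are illustrative placeholders, not part of the patch):
```
from src.eval import eval_kernel_against_ref

# ref_src / triton_src are placeholder strings: the reference nn.Module source and a
# generated ModelNew that embeds @triton.jit kernels.
result = eval_kernel_against_ref(
    ref_src,
    triton_src,
    verbose=True,
    num_correct_trials=5,
    num_perf_trials=100,
    backend="triton",  # selects the tempfile-based loader instead of the default CUDA build path
)
print(result.compiled, result.correctness, result.metadata)
```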
@@ -506,11 +609,21 @@ def eval_kernel_against_ref( torch.cuda.synchronize(device=device) set_seed(seed_num) inputs = get_inputs() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] - model_new = custom_model.cuda(device=device) + # Convert inputs for performance measurement + inputs = [_process_input_tensor(x, device, backend) for x in inputs] + + # if backend.lower() == "tilelang": + # try: + # model_new = custom_model.to(device=device, dtype=torch.float16) + # except Exception as e: + # # TileLang JIT kernels may not support .to(), already on GPU + # if verbose: + # print(f"[Info] Line 616 - Could not call .to() on custom model for perf measurement (TileLang): {e}") + # print("[Traceback] From performance measurement - line 616:") + # traceback.print_exc() + # model_new = custom_model + # else: + model_new = custom_model.to(device=device) torch.cuda.synchronize(device=device) elapsed_times = time_execution_with_cuda_event( @@ -627,6 +740,7 @@ def run_and_check_correctness( verbose=False, seed=42, device=None, + backend="cuda", ) -> KernelExecResult: """ run the model and check correctness, @@ -634,6 +748,7 @@ def run_and_check_correctness( this is all on GPU, requiring cuda device and transfer .cuda() num_correct_trials: run the evalutation multiple times with (ideally) different random inputs to ensure correctness + backend: backend type for handling dtype conversions """ pass_count = 0 @@ -650,19 +765,42 @@ def run_and_check_correctness( trial_seed = correctness_trial_seeds[trial] if verbose: print(f"[Eval] Generating Random Input with seed {trial_seed}") + + # if backend.lower() == "tilelang": + # torch.set_default_dtype(torch.float16) set_seed(trial_seed) inputs = get_inputs_fn() - inputs = [ - x.cuda(device=device) if isinstance(x, torch.Tensor) else x - for x in inputs - ] + # Convert inputs to appropriate dtypes for GPU computation + inputs = [_process_input_tensor(x, device, backend) for x in inputs] set_seed(trial_seed) - model = original_model_instance.cuda(device=device) + # if backend.lower() == "tilelang": + # try: + # model = original_model_instance.to(device=device, dtype=torch.float16) + # except Exception as e: + # # TileLang JIT kernels may not support .to(), already on GPU + # if verbose: + # print(f"[Info] Line 771 - Could not call .to() on original model (TileLang): {e}") + # print("[Traceback] From run_and_check_correctness - line 771:") + # traceback.print_exc() + # model = original_model_instance + # else: + model = original_model_instance.to(device=device) set_seed(trial_seed) - model_new = new_model_instance.cuda(device=device) + # if backend.lower() == "tilelang": + # try: + # model_new = new_model_instance.to(device=device, dtype=torch.float16) + # except Exception as e: + # # TileLang JIT kernels may not support .to(), already on GPU + # if verbose: + # print(f"[Info] Line 777 - Could not call .to() on custom model (TileLang): {e}") + # print("[Traceback] From run_and_check_correctness - line 777:") + # traceback.print_exc() + # model_new = new_model_instance + # else: + model_new = new_model_instance.to(device=device) output = model(*inputs) torch.cuda.synchronize(device=device) @@ -705,11 +843,16 @@ def run_and_check_correctness( except Exception as e: print("[Error] Exception happens during correctness check") print(f"Error in launching kernel for ModelNew: {e}") + print("\n[Full Traceback]:") + traceback.print_exc() + print("\n") metadata = register_and_format_exception( "runtime_error", e, metadata, 
truncate=True ) metadata["runtime_error_name"] = get_error_name(e) + # Also store the full traceback in metadata for debugging + metadata["runtime_error_traceback"] = traceback.format_exc() return KernelExecResult( compiled=True, correctness=False, metadata=metadata ) @@ -840,4 +983,4 @@ def get_timing_stats(elapsed_times: list[float], device: torch.device = None) -> # if __name__ == "__main__": # fetch_kernel_from_database("kernelbench_prompt_v2_level_2", 1, 1, "http://localhost:9091") # print(fetch_ref_arch_from_level_problem_id("2", 1, with_name=True)) -# fetch_baseline_time("level1", 0, ["1_Square_matrix_multiplication_.py"], "tests/baseline_time_matx3.json") +# fetch_baseline_time("level1", 0, ["1_Square_matrix_multiplication_.py"], "tests/baseline_time_matx3.json") \ No newline at end of file diff --git a/src/prompt_constructor_multilang.py b/src/prompt_constructor_multilang.py new file mode 100644 index 00000000..39d16243 --- /dev/null +++ b/src/prompt_constructor_multilang.py @@ -0,0 +1,553 @@ +import os +from .utils import read_file + +""" +Multi-Language Prompt Constructor + +Supports: Triton, CuTe (TileLang currently disabled/commented out) + +Design principles: +- To evaluate base model performance on KernelBench, we use the simplest prompt possible to guide model output to generated desired output format. +- However, we do not do extensive prompt engineering or few-shot examples in the LLM to steer behaviour. +""" + +REPO_TOP_PATH = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + ) +) +KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") + + +def get_arch_definition_from_file(arch_path): + arch_src = read_file(arch_path) + return get_arch_definition(arch_src) + + +def get_arch_definition(arch_src): + """ + Construct torch definition from original torch nn.Module definition + """ + prompt = f"Here is a pytorch defintion of a neural network architecture in the file model.py: ```{arch_src}```\n" + return prompt + + +################################################################################ +# Triton Backend +################################################################################ + +TRITON_PROBLEM_STATEMENT = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. \n + You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" + +TRITON_PROBLEM_INSTRUCTION = """ +Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +""" + +TRITON_PROBLEM_STATEMENT_CLEANED = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. 
You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" + +TRITON_PROBLEM_INSTRUCTION_CLEANED = """ +Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +""" + + +def prompt_generate_custom_triton( + arc_src: str, example_arch_src: str, example_new_arch_src: str +) -> str: + prompt = TRITON_PROBLEM_STATEMENT + + assert ( + "@triton.jit" in example_new_arch_src + ), "Example new arch must contain Triton kernel" + + if example_arch_src != "" and example_new_arch_src != "": + prompt += f""" + Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n + ``` \n + {example_arch_src} + ``` \n + The example new arch with custom Triton kernels looks like this: \n + ``` + {example_new_arch_src} + ``` \n + """ + + prompt += f""" + You are given the following architecture: \n + ``` + {arc_src} + ``` + """ + prompt += TRITON_PROBLEM_INSTRUCTION + return prompt + + +def prompt_generate_custom_triton_fewshot_and_template( + ref_arch_src: str, shots: list +) -> str: + raise NotImplementedError("This function has not been implemented yet") + + +def prompt_generate_ex_with_CoT_template_triton(ref_arch_src: str, cot_example: str) -> str: + raise NotImplementedError("This function has not been implemented yet") + + +def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str: + """ + Using prompt example (an element-wise addition) for prompt templates + The most basic form of example just to show LLM the task and the expected output format + """ + arch = ref_arch_src + + # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom Triton kernels) + example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") + example_new_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" + ) + + if not os.path.exists(example_arch_path): + raise FileNotFoundError( + f"Example architecture file not found: {example_arch_path}" + ) + if not os.path.exists(example_new_arch_path): + raise FileNotFoundError( + f"Example new architecture file not found: {example_new_arch_path}" + ) + + example_arch = read_file(example_arch_path) + example_new_arch = read_file(example_new_arch_path) + + return prompt_generate_custom_triton(arch, example_arch, example_new_arch) + + +def prompt_generate_prompt_with_hardware_info_from_template_triton( + ref_arch_src: str, gpu_name: str +) -> str: + """ + Similar to prompt_generate_custom_triton_from_prompt_template, + but with hardware information for the given GPU + """ + arch = ref_arch_src + + example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") + example_new_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" + ) + gpu_spec_file_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py" + ) + + example_arch = read_file(example_arch_path) + example_new_arch = read_file(example_new_arch_path) + gpu_spec_info = read_file(gpu_spec_file_path) + + return 
prompt_generate_prompt_with_hardware_info_triton( + ref_arch_src=arch, + gpu_name=gpu_name, + example_arch_src=example_arch, + example_new_arch_src=example_new_arch, + gpu_spec_info_src=gpu_spec_info, + ) + + +def prompt_generate_prompt_with_hardware_info_triton( + ref_arch_src: str, + gpu_name: str, + example_arch_src: str, + example_new_arch_src: str, + gpu_spec_info_src: str, +) -> str: + """ + Generate a prompt with hardware information for the given GPU + gpu_spec_info_src: str of the gpu spec src file + """ + local_dict = {} + exec(gpu_spec_info_src, {}, local_dict) + + GPU_SPEC_INFO = local_dict.get("GPU_SPEC_INFO") + GPU_DEFINITIONS = local_dict.get("GPU_DEFINITIONS") + GPU_BEST_PRACTICES = local_dict.get("GPU_BEST_PRACTICES") + + if not GPU_SPEC_INFO or not GPU_DEFINITIONS or not GPU_BEST_PRACTICES: + raise ValueError( + "GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src" + ) + + assert gpu_name in GPU_SPEC_INFO, f"GPU name {gpu_name} not found in GPU_SPEC_INFO" + + prompt = TRITON_PROBLEM_STATEMENT + + if example_arch_src != "" and example_new_arch_src != "": + prompt += f""" + Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n + ``` \n + {example_arch_src} + ``` \n + The example new arch with custom Triton kernels looks like this: + ``` + {example_new_arch_src} + ``` \n + """ + + curr_gpu_spec_info = GPU_SPEC_INFO[gpu_name] + gpu_architecture = curr_gpu_spec_info.get("GPU Architecture") + prompt += f""" + Here is some information about the underlying hardware that you should keep in mind. \n\n +The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.\n\n""" + + for key, value in curr_gpu_spec_info.items(): + if key == "GPU Architecture": + continue + prompt += f"""- We have {value} of {key}.\n""" + + prompt += f"""\n\n +Here are some concepts about the GPU architecture that could be helpful: \n\n""" + for key, value in GPU_DEFINITIONS.items(): + prompt += f"""- {key}: {value}\n""" + + prompt += f"""\n\n +Here are some best practices for writing Triton kernels on GPU: \n\n""" + for best_practice in GPU_BEST_PRACTICES: + prompt += f"""- {best_practice}\n""" + + prompt += f""" + You are given the following architecture: \n + ``` + {ref_arch_src} + ``` + """ + + prompt += TRITON_PROBLEM_INSTRUCTION + return prompt + + +def prompt_fix_compile_triton(ref_arch_src, custom_kernel, metadata): + prompt = TRITON_PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed to compile: + ``` + {custom_kernel} + ``` + Here's the metadata of the compilation error: + ``` + {metadata} + ``` + + Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. + """ + return prompt + + +def prompt_fix_correctness_triton(ref_arch_src, custom_kernel, metadata): + prompt = TRITON_PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed correctness: + ``` + {custom_kernel} + ``` + Here's the metadata of the correctness error: + ``` + {metadata} + ``` + Please consider how your custom Triton kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. 
+ """ + return prompt + + +################################################################################ +# TileLang Backend - COMMENTED OUT (not working currently) +################################################################################ + +# TILELANG_PROBLEM_STATEMENT = """You write custom TileLang kernels to replace the pytorch operators in the given architecture to get speedups. \n +# You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom TileLang kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +# """ +# +# TILELANG_PROBLEM_INSTRUCTION = """ +# Optimize the architecture named Model with custom TileLang kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +# """ +# +# TILELANG_PROBLEM_STATEMENT_CLEANED = """You write custom TileLang kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom TileLang kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +# """ +# +# TILELANG_PROBLEM_INSTRUCTION_CLEANED = """ +# Optimize the architecture named Model with custom TileLang kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n +# """ +# +# +# def prompt_generate_custom_tilelang( +# arc_src: str, example_arch_src: str, example_new_arch_src: str +# ) -> str: +# prompt = TILELANG_PROBLEM_STATEMENT +# +# if example_arch_src != "" and example_new_arch_src != "": +# prompt += f""" +# Here's an example to show you the syntax of inline embedding custom TileLang kernels in torch: The example given architecture is: \n +# ``` \n +# {example_arch_src} +# ``` \n +# The example new arch with custom TileLang kernels looks like this: \n +# ``` +# {example_new_arch_src} +# ``` \n +# """ +# +# prompt += f""" +# You are given the following architecture: \n +# ``` +# {arc_src} +# ``` +# """ +# prompt += TILELANG_PROBLEM_INSTRUCTION +# return prompt +# +# +# def prompt_generate_custom_tilelang_from_prompt_template(ref_arch_src: str) -> str: +# """ +# Using prompt example for TileLang +# Note: You'll need to create a TileLang example file similar to the Triton one +# """ +# arch = ref_arch_src +# +# # TODO: Create model_new_ex_add_tilelang.py example file +# example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") +# example_new_arch_path = os.path.join( +# REPO_TOP_PATH, f"src/prompts/model_new_ex_add_tilelang.py" +# ) +# +# if not os.path.exists(example_arch_path): +# raise FileNotFoundError( +# f"Example architecture file not found: {example_arch_path}" +# ) +# if not os.path.exists(example_new_arch_path): +# # For now, use a basic template without examples if file doesn't exist +# return prompt_generate_custom_tilelang(arch, "", "") +# +# example_arch = read_file(example_arch_path) +# example_new_arch = read_file(example_new_arch_path) +# +# return prompt_generate_custom_tilelang(arch, example_arch, example_new_arch) +# +# +# def prompt_fix_compile_tilelang(ref_arch_src, custom_kernel, metadata): +# prompt = TILELANG_PROBLEM_STATEMENT +# prompt += f""" +# With the following architecture: +# ``` +# {ref_arch_src} +# ``` +# You generated the following solution and it failed to compile: +# ``` +# {custom_kernel} +# ``` +# Here's the metadata of the compilation error: +# ``` +# {metadata} +# ``` +# +# Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. +# """ +# return prompt +# +# +# def prompt_fix_correctness_tilelang(ref_arch_src, custom_kernel, metadata): +# prompt = TILELANG_PROBLEM_STATEMENT +# prompt += f""" +# With the following architecture: +# ``` +# {ref_arch_src} +# ``` +# You generated the following solution and it failed correctness: +# ``` +# {custom_kernel} +# ``` +# Here's the metadata of the correctness error: +# ``` +# {metadata} +# ``` +# Please consider how your custom TileLang kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. +# """ +# return prompt + + +################################################################################ +# CuTe Backend +################################################################################ + +CUTE_PROBLEM_STATEMENT = """You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups. \n + You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. 
You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" + +CUTE_PROBLEM_INSTRUCTION = """ +Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +""" + +CUTE_PROBLEM_STATEMENT_CLEANED = """You write custom CuTe (CUTLASS) kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom CuTe kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n +""" + +CUTE_PROBLEM_INSTRUCTION_CLEANED = """ +Optimize the architecture named Model with custom CuTe (CUTLASS) kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n +""" + + +def prompt_generate_custom_cute( + arc_src: str, example_arch_src: str, example_new_arch_src: str +) -> str: + prompt = CUTE_PROBLEM_STATEMENT + + if example_arch_src != "" and example_new_arch_src != "": + prompt += f""" + Here's an example to show you the syntax of inline embedding custom CuTe (CUTLASS) kernels in torch: The example given architecture is: \n + ``` \n + {example_arch_src} + ``` \n + The example new arch with custom CuTe kernels looks like this: \n + ``` + {example_new_arch_src} + ``` \n + """ + + prompt += f""" + You are given the following architecture: \n + ``` + {arc_src} + ``` + """ + prompt += CUTE_PROBLEM_INSTRUCTION + return prompt + + +def prompt_generate_custom_cute_from_prompt_template(ref_arch_src: str) -> str: + """ + Using prompt example for CuTe + Note: You'll need to create a CuTe example file + """ + arch = ref_arch_src + + # TODO: Create model_new_ex_add_cute.py example file + example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") + example_new_arch_path = os.path.join( + REPO_TOP_PATH, f"src/prompts/model_new_ex_add_cute.py" + ) + + if not os.path.exists(example_arch_path): + raise FileNotFoundError( + f"Example architecture file not found: {example_arch_path}" + ) + if not os.path.exists(example_new_arch_path): + # For now, use a basic template without examples if file doesn't exist + return prompt_generate_custom_cute(arch, "", "") + + example_arch = read_file(example_arch_path) + example_new_arch = read_file(example_new_arch_path) + + return prompt_generate_custom_cute(arch, example_arch, example_new_arch) + + +def prompt_fix_compile_cute(ref_arch_src, custom_kernel, metadata): + prompt = CUTE_PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed to compile: + ``` + {custom_kernel} + ``` + Here's 
the metadata of the compilation error: + ``` + {metadata} + ``` + + Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. + """ + return prompt + + +def prompt_fix_correctness_cute(ref_arch_src, custom_kernel, metadata): + prompt = CUTE_PROBLEM_STATEMENT + prompt += f""" + With the following architecture: + ``` + {ref_arch_src} + ``` + You generated the following solution and it failed correctness: + ``` + {custom_kernel} + ``` + Here's the metadata of the correctness error: + ``` + {metadata} + ``` + Please consider how your custom CuTe kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. + """ + return prompt + + +################################################################################ +# Unified API +################################################################################ + +def get_prompt_for_backend(ref_arch_src: str, backend: str = "triton") -> str: + """ + Unified API to get prompt for any supported backend + + Args: + ref_arch_src: Reference architecture source code + backend: One of 'triton', 'cute' (tilelang removed - not working) + + Returns: + Prompt string for the specified backend + """ + backend_lower = backend.lower() + + if backend_lower == "triton": + return prompt_generate_custom_triton_from_prompt_template(ref_arch_src) + # elif backend_lower == "tilelang": + # return prompt_generate_custom_tilelang_from_prompt_template(ref_arch_src) + elif backend_lower == "cute": + return prompt_generate_custom_cute_from_prompt_template(ref_arch_src) + else: + raise ValueError( + f"Unsupported backend: {backend}. Must be one of: 'triton', 'cute'" + ) + + +################################################################################ +# Main (for testing) +################################################################################ + +def main(): + gpu_name = "L40S" + backend = "triton" # Change this to test different backends + + ref_arch_src = read_file(os.path.join(KERNEL_BENCH_PATH, f"level1/19_ReLU.py")) + assert len(ref_arch_src) > 0, "ref_arch_src is empty" + + prompt = get_prompt_for_backend(ref_arch_src, backend) + print(f"\n{'='*80}\n{backend.upper()} PROMPT:\n{'='*80}\n") + print(prompt) + + # Write prompt to temp file + temp_file_path = os.path.join(REPO_TOP_PATH, "scratch", f"prompt_{backend}_draft.txt") + os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) + with open(temp_file_path, "w") as f: + f.write(prompt) + print(f"\nPrompt written to: {temp_file_path}") + + +if __name__ == "__main__": + main() + + + diff --git a/src/prompt_constructor_triton.py b/src/prompt_constructor_triton.py deleted file mode 100644 index e3ea945b..00000000 --- a/src/prompt_constructor_triton.py +++ /dev/null @@ -1,297 +0,0 @@ -import os - -from .utils import read_file - - -""" -Construct Prompt - -Design principles: -- To evaluate base model performance on KernelBench, we use the simplest prompt possible to guide model output to generated desired output format. -- However, we do not do extensive prompt engineering or few-shot example in the LLM to steer behaviour. 
-""" - -REPO_TOP_PATH = os.path.abspath( - os.path.join( - os.path.dirname(__file__), - "..", - ) -) -KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") - - -def get_arch_definition_from_file(arch_path): - arch_src = read_file(arch_path) - return get_arch_definition(arch_src) - - -def get_arch_definition(arch_src): - """ - Construct torch definition from original torch nn.Module definition - """ - prompt = f"Here is a pytorch defintion of a neural network architecture in the file model.py: ```{arch_src}```\n" - return prompt - - -############################################ -# Triton Prompt -############################################ - -PROBLEM_STATEMENT = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups. \n - You have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" - -PROBLEM_INSTRUCTION = """ -Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! \n -""" - - -def prompt_generate_custom_triton( - arc_src: str, example_arch_src: str, example_new_arch_src: str -) -> str: - prompt = PROBLEM_STATEMENT - - assert ( - "@triton.jit" in example_new_arch_src - ), "Example new arch must contain Triton kernel" - - if example_arch_src != "" and example_new_arch_src != "": - prompt += f""" - Here's an example to show you the syntax of inline embedding custom Triton kernels in torch: The example given architecture is: \n - ``` \n - {example_arch_src} - ``` \n - The example new arch with custom Triton kernels looks like this: \n - ``` - {example_new_arch_src} - ``` \n - """ - - prompt += f""" - You are given the following architecture: \n - ``` - {arc_src} - ``` - """ - prompt += PROBLEM_INSTRUCTION - return prompt - - -PROBLEM_STATEMENT_CLEANED = """You write custom Triton kernels to replace the pytorch operators in the given architecture to get speedups.\n\nYou have complete freedom to choose the set of operators you want to replace. You may make the decision to replace some operators with custom Triton kernels and leave others unchanged. You may replace multiple operators with custom implementations, consider operator fusion opportunities (combining multiple operators into a single kernel, for example, combining matmul+relu), or algorithmic changes (such as online softmax). You are only limited by your imagination.\n -""" -PROBLEM_INSTRUCTION_CLEANED = """ -Optimize the architecture named Model with custom Triton kernels! Name your optimized output architecture ModelNew. Output the new code in codeblocks. Please generate real code, NOT pseudocode, make sure the code compiles and is fully functional. Just output the new model code, no other text, and NO testing code! 
\n -""" - - -def prompt_generate_custom_triton_fewshot_and_template( - ref_arch_src: str, shots: list -) -> str: - raise NotImplementedError("This function has not been implemented yet") - - -def prompt_generate_ex_with_CoT_template(ref_arch_src: str, cot_example: str) -> str: - raise NotImplementedError("This function has not been implemented yet") - - -def prompt_generate_custom_triton_from_prompt_template(ref_arch_src: str) -> str: - """ - Using prompt example (an element-wise addition) for prompt templates - The most basic form of example just to show LLM the task and the expected output format - """ - arch = ref_arch_src - # These are strictly defined for now - - # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) - example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") - example_new_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" - ) - - if not os.path.exists(example_arch_path): - raise FileNotFoundError( - f"Example architecture file not found: {example_arch_path}" - ) - if not os.path.exists(example_new_arch_path): - raise FileNotFoundError( - f"Example new architecture file not found: {example_new_arch_path}" - ) - - example_arch = read_file(example_arch_path) - example_new_arch = read_file(example_new_arch_path) - - return prompt_generate_custom_triton(arch, example_arch, example_new_arch) - - -def prompt_generate_prompt_with_hardware_info_from_template( - ref_arch_src: str, gpu_name: str -) -> str: - """ - Similar to prompt_generate_custom_triton_from_prompt_template, - but with hardware information for the given GPU - """ - - arch = ref_arch_src - # These are strictly defined for now - - # path to prompt template, show an example of Model (torch specifications) and ModelNew (torch + custom CUDA kernels) - example_arch_path = os.path.join(REPO_TOP_PATH, f"src/prompts/model_ex_add.py") - example_new_arch_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/model_new_ex_add_triton.py" - ) - - gpu_spec_file_path = os.path.join( - REPO_TOP_PATH, f"src/prompts/hardware/gpu_specs.py" - ) - - example_arch = read_file(example_arch_path) - example_new_arch = read_file(example_new_arch_path) - gpu_spec_info = read_file(gpu_spec_file_path) - - return prompt_generate_prompt_with_hardware_info( - ref_arch_src=arch, - gpu_name=gpu_name, - example_arch_src=example_arch, - example_new_arch_src=example_new_arch, - gpu_spec_info_src=gpu_spec_info, - ) - - -def prompt_generate_prompt_with_hardware_info( - ref_arch_src: str, - gpu_name: str, - example_arch_src: str, - example_new_arch_src: str, - gpu_spec_info_src: str, -) -> str: - """ - Generate a prompt with hardware information for the given GPU - gpu_spec_info_src: str of the gpu spec src file - """ - - # Create a dictionary to store the local namespace - local_dict = {} - - # Execute the GPU spec file in the local namespace - exec(gpu_spec_info_src, {}, local_dict) - - # Get the required variables from the local namespace - GPU_SPEC_INFO = local_dict.get("GPU_SPEC_INFO") - GPU_DEFINITIONS = local_dict.get("GPU_DEFINITIONS") - GPU_BEST_PRACTICES = local_dict.get("GPU_BEST_PRACTICES") - - if not GPU_SPEC_INFO or not GPU_DEFINITIONS or not GPU_BEST_PRACTICES: - raise ValueError( - "GPU_SPEC_INFO or GPU_DEFINITIONS or GPU_BEST_PRACTICES not found in gpu_spec_info_src" - ) - - assert gpu_name in GPU_SPEC_INFO, f"GPU name {gpu_name} not found in GPU_SPEC_INFO" - - prompt = PROBLEM_STATEMENT - - if example_arch_src != "" and 
example_new_arch_src != "": - prompt += f""" - Here's an example to show you the syntax of inline embedding custom CUDA operators in torch: The example given architecture is: \n - ``` \n - {example_arch_src} - ``` \n - The example new arch with custom CUDA kernels looks like this: - ``` - {example_new_arch_src} - ``` \n - """ - - curr_gpu_spec_info = GPU_SPEC_INFO[gpu_name] - - gpu_architecture = curr_gpu_spec_info.get("GPU Architecture") - prompt += f""" - Here is some information about the underlying hardware that you should keep in mind. \n\n -The GPU that will run the kernel is NVIDIA {gpu_name}, {gpu_architecture} architecture.\n\n""" - - for key, value in curr_gpu_spec_info.items(): - if key == "GPU Architecture": - continue - prompt += f"""- We have {value} of {key}.\n""" - - prompt += f"""\n\n -Here are some concepts about the GPU architecture that could be helpful: \n\n""" - for key, value in GPU_DEFINITIONS.items(): - prompt += f"""- {key}: {value}\n""" - - prompt += f"""\n\n -Here are some best practices for writing CUDA kernels on GPU: \n\n""" - for best_practice in GPU_BEST_PRACTICES: - prompt += f"""- {best_practice}\n""" - - prompt += f""" - You are given the following architecture: \n - ``` - {ref_arch_src} - ``` - """ - - prompt += PROBLEM_INSTRUCTION - return prompt - - return None - - -def prompt_fix_compile(ref_arch_src, custom_cuda, metadata): - prompt = PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the following solution and it failed to compile: - ``` - {custom_cuda} - ``` - Here's the metadata of the compilation error: - ``` - {metadata} - ``` - - Please fix the compilation error in the new model code. Please output the corrected code in codeblocks. - """ - return prompt - - -def prompt_fix_correctness(ref_arch_src, custom_cuda, metadata): - prompt = PROBLEM_STATEMENT - prompt += f""" - With the following architecture: - ``` - {ref_arch_src} - ``` - You generated the following solution and it failed correctness: - ``` - {custom_cuda} - ``` - Here's the metadata of the correctness error: - ``` - {metadata} - ``` - Please consider how your custom CUDA kernels are implemented, how it is different from the reference implementation, and fix the correctness error in the new model code. Please output the corrected code in codeblocks. 
- """ - return prompt - - -def main(): - gpu_name = "L40S" - - ref_arch_src = read_file(os.path.join(KERNEL_BENCH_PATH, f"level1/19_ReLU.py")) - assert len(ref_arch_src) > 0, "ref_arch_src is empty" - prompt = prompt_generate_prompt_with_hardware_info_from_template( - ref_arch_src, gpu_name - ) - print(prompt) - # Write prompt to temp file - temp_file_path = os.path.join(REPO_TOP_PATH, "scratch", "prompt_draft.txt") - os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) - with open(temp_file_path, "w") as f: - f.write(prompt) - - -if __name__ == "__main__": - main() From 31116dd8723f8e444511ec61d8ae8dd5ed26e69b Mon Sep 17 00:00:00 2001 From: AffectionateCurry <160357316+AffectionateCurry@users.noreply.github.com> Date: Tue, 21 Oct 2025 21:37:07 -0700 Subject: [PATCH 09/15] updated requirements --- requirements.txt | Bin 150 -> 416 bytes .../generate_and_eval_single_sample_modal.py | 6 ++++++ 2 files changed, 6 insertions(+) diff --git a/requirements.txt b/requirements.txt index 043818816cf59a2a9f8b99581cd3710dd924daf5..9d85da228587bb89ef73be9374ea6866476ec406 100644 GIT binary patch literal 416 zcmX|-+X})k5Jcx$@KYip_~^Gt@LFxt+D5^jSI?xal%$!t z)MY)WYs%RPGHc+z%C&L}lXih=xub>){GHOOyHt8{o-;dvb-&z>IdKLJ zu*Rvp=0=TC#O1_(N%;3U6^|z}rc2yXk-x$b@IUEQa~}8t{4h<_d7RSPJh3=U(uo3f HFQ)JVBz8!M literal 150 zcmWN}K@P$|2mru4e`#Y(JoYVy>{5z$0Rc_=d-puapwY^QYBQN=58@464;DST9}jqOMW!VGNw8x=2(;k PD*8ge8JaHBNz^?5Jt#Le diff --git a/scripts/generate_and_eval_single_sample_modal.py b/scripts/generate_and_eval_single_sample_modal.py index c2a38c2e..17a9f693 100644 --- a/scripts/generate_and_eval_single_sample_modal.py +++ b/scripts/generate_and_eval_single_sample_modal.py @@ -1,3 +1,9 @@ +''' +Example Usage: +python scripts/generate_and_eval_single_sample_modal.py dataset_src=huggingfac level=1 problem_id=1 eval_mode=modal gpu=L40S + server_type=deepseek model_name=deepseek-coder max_tokens=4096 temperature=0.0 +''' + import pydra from pydra import REQUIRED, Config import os, sys From 37dd8dbeaef662097835292f07c82e76a6759460 Mon Sep 17 00:00:00 2001 From: AffectionateCurry <160357316+AffectionateCurry@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:09:04 -0700 Subject: [PATCH 10/15] added back requirements.txt --- requirements.txt | Bin 0 -> 388 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/requirements.txt b/requirements.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a79836e3b9819af89afe16243a7e6f71cb6ca99c 100644 GIT binary patch literal 388 zcmX|--3r1m6oYdu_$Z19UivIj_N#8)+^XQqt6w^m(XL6(Npt#s1w|#bnlxzCS%ubG zsRxw`>N1MD!;E@nCvXI*1!zh(ZOx_JEc{-ob}-R%j^W!J-I7>i4$ms wHI8|j3q3}GDkt_No4-$$Yy2Whx)R+Uk=quJ2NfkN&{gn`iFBT52AAxeK5!UA%K!iX literal 0 HcmV?d00001 From f0b88d752ef69e16023b22bb197344ac4a95b473 Mon Sep 17 00:00:00 2001 From: nathanjpaek <46549297+nathanjpaek@users.noreply.github.com> Date: Wed, 22 Oct 2025 17:43:16 -0700 Subject: [PATCH 11/15] add cute one shot addition example --- src/prompts/model_new_ex_add_cute.py | 57 ++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 src/prompts/model_new_ex_add_cute.py diff --git a/src/prompts/model_new_ex_add_cute.py b/src/prompts/model_new_ex_add_cute.py new file mode 100644 index 00000000..3cd3d63f --- /dev/null +++ b/src/prompts/model_new_ex_add_cute.py @@ -0,0 +1,57 @@ +import torch +import cutlass +import cutlass.cute as cute +from cutlass.cute.runtime import from_dlpack + +@cute.kernel +def elementwise_add_kernel(gA: cute.Tensor, gB: cute.Tensor, gC: cute.Tensor): + tidx, _, _ = cute.arch.thread_idx() + bidx, 
_, _ = cute.arch.block_idx() + bdim, _, _ = cute.arch.block_dim() + + thread_idx = bidx * bdim + tidx + + m, n = gA.shape + ni = thread_idx % n + mi = thread_idx // n + + a_val = gA[mi, ni] + b_val = gB[mi, ni] + + gC[mi, ni] = a_val + b_val + +@cute.jit +def elementwise_add_host(mA: cute.Tensor, mB: cute.Tensor, mC: cute.Tensor): + M = mA.shape[0] + N = mA.shape[1] + + threads_per_block = 256 + total_elems = M * N + grid_x = cute.ceil_div(total_elems, threads_per_block) + + elementwise_add_kernel(mA, mB, mC).launch(grid=(grid_x, 1, 1), block=(threads_per_block, 1, 1)) + + +class ModelNew(torch.nn.Module): + def __init__(self): + super().__init__() + self.compiled = {} + + def forward(self, A: torch.Tensor, B: torch.Tensor) -> torch.Tensor: + M, N = A.shape + A = A.contiguous().cuda() + B = B.contiguous().cuda() + C = torch.empty((M, N), dtype=A.dtype, device=A.device) + + mA = from_dlpack(A, assumed_align=16).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1)) + mB = from_dlpack(B, assumed_align=16).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1)) + mC = from_dlpack(C, assumed_align=16).mark_compact_shape_dynamic(mode=0, stride_order=(0, 1)) + + key = (A.dtype,) + compiled = self.compiled.get(key) + if compiled is None: + compiled = cute.compile(elementwise_add_host, mA, mB, mC) + self.compiled[key] = compiled + + compiled(mA, mB, mC) + return C From 6bbcf976c5d84cacb2d2671ab6d19c1936ba1bc6 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 22 Oct 2025 17:49:30 -0700 Subject: [PATCH 12/15] remove unncessary files and redo requirements --- KernelBench/test.py | 8 -------- requirements.txt | Bin 388 -> 0 bytes 2 files changed, 8 deletions(-) delete mode 100644 KernelBench/test.py delete mode 100644 requirements.txt diff --git a/KernelBench/test.py b/KernelBench/test.py deleted file mode 100644 index 4c4d5451..00000000 --- a/KernelBench/test.py +++ /dev/null @@ -1,8 +0,0 @@ -import torch - -print("Before patch:", torch.randn) # built-in - -import src.utils # or: import src.utils -print("After patch:", torch.randn) # - -print("Module :", torch.randn.__module__) # should be 'src.utils' \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a79836e3b9819af89afe16243a7e6f71cb6ca99c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 388 zcmX|--3r1m6oYdu_$Z19UivIj_N#8)+^XQqt6w^m(XL6(Npt#s1w|#bnlxzCS%ubG zsRxw`>N1MD!;E@nCvXI*1!zh(ZOx_JEc{-ob}-R%j^W!J-I7>i4$ms wHI8|j3q3}GDkt_No4-$$Yy2Whx)R+Uk=quJ2NfkN&{gn`iFBT52AAxeK5!UA%K!iX From e2bc2719994f0e83b4ebdb06f492beec6be5534f Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 22 Oct 2025 17:51:57 -0700 Subject: [PATCH 13/15] let's see if that fixes it --- requirements.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..c912156c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +# Frameworks +torch==2.5.0 +# we shall upgrade torch for blackwell when it is stable +transformers +datasets +modal + +# DSLs +nvidia-cutlass-dsl + +# helper +tqdm +packaging +pydra_config +pytest +ninja + +# Numerics +einops +dotenv +numpy + +# to deprecate with litellm +google-generativeai +together +openai +anthropic + From a5f63af5d8ff26771ec2b4c8062f54d189172c08 Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 22 Oct 2025 18:00:48 -0700 Subject: [PATCH 14/15] fix config in file suggested soksoerey --- README.md | 19 
++----------------- scripts/generate_samples.py | 2 +- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 99db491b..07ee4851 100644 --- a/README.md +++ b/README.md @@ -120,25 +120,10 @@ We provide some reference baseline times a variety of NVIDIA GPUs across generat We have also releaed the test-time framework [Caesar](https://github.com/simonguozirui/caesar) that are used in the multi-turn / iterative refinement experiments in our paper. You can use or modify this framework for high-throughput test-time scaling (both sequential and parallel) targeting KernelBench problems. ## 🛣️ Upcoming Roadmap -- [ ] Triton Variant (To be merged) -- [ ] Easy to use CoLab Notebook Example -- [ ] Push button flow on Modal / Cloud Provider -- [ ] Integrate with more frameworks, such as [ThunderKittens](https://github.com/HazyResearch/ThunderKittens) -- [ ] Add backward pass -- [ ] Integrate with toolchains such as NCU -See Issues for the ongoing roadmap and directions. - - +Check out our [roadmap](https://github.com/ScalingIntelligence/KernelBench/issues/74) for what we plan to add as features. We welcome community contirbutions in these directions. ## 🔍 Known Usage -- [NVIDIA](https://developer.nvidia.com/blog/automating-gpu-kernel-generation-with-deepseek-r1-and-inference-time-scaling/) - Automating GPU Kernel Generation with DeepSeek-R1 and Inference Time Scaling -- [METR](https://metr.org/blog/2025-02-14-measuring-automated-kernel-engineering/) - Measuring Automated Kernel Engineering -- [Sakana AI](https://sakana.ai/ai-cuda-engineer/) - AI Cuda Engineer -- [Project Popcorn](https://www.youtube.com/watch?v=mdDVkBeFy9A) - Triton Support for KernelBench, Data Scaling + SFT'd Kernel LLM -- [Kevin](https://cognition.ai/blog/kevin-32b) - Kevin-32B: Multi-Turn RL for Writing CUDA Kernels -- [Simple Test-Time Search](https://scalingintelligence.stanford.edu/blogs/fastkernels/) - by @anneouyang - -If you are using KernelBench, we love to hear more about it! +Since release, we have gotten a lot of interest from researchers, research labs, and companies that use KernelBench to explore this direction. We have documented [known usage](https://docs.google.com/document/d/e/2PACX-1vTjS-UMH1HB5n_PENq2k-3YRfXIXkqKIKeNC2zcWMyLPdl4Jrwvdk4dNDVSsM8ybKrCxZB7GJq1slZF/pub) of KernelBench and related efforts towards automated kernel generations. If you are using KernelBench, we love to hear more about it! ## 🪪 License MIT. Check `LICENSE.md` for more details. diff --git a/scripts/generate_samples.py b/scripts/generate_samples.py index 64e65231..5ee217cf 100644 --- a/scripts/generate_samples.py +++ b/scripts/generate_samples.py @@ -186,7 +186,7 @@ def check_kernel_exists( return os.path.exists(kernel_path) -@pydra.main(base=Generation.) 
+@pydra.main(base=GenerationConfig) def main(config: GenerationConfig): """ Batch Generate Samples for Particular Level From dc87a3714d1616f809c99902a7be57c954b22cda Mon Sep 17 00:00:00 2001 From: Simon Guo Date: Wed, 22 Oct 2025 18:10:33 -0700 Subject: [PATCH 15/15] move natalia's old file into change log --- .../changelog/original_files.txt | Bin README.md | 5 +++++ 2 files changed, 5 insertions(+) rename original_files.txt => KernelBench/changelog/original_files.txt (100%) diff --git a/original_files.txt b/KernelBench/changelog/original_files.txt similarity index 100% rename from original_files.txt rename to KernelBench/changelog/original_files.txt diff --git a/README.md b/README.md index 07ee4851..8ec45e8a 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ We construct KernelBench to have 4 Levels of categories: - **Level 4 🤗**: Level Hugging Face Optimize whole model architectures from HuggingFace +We are actively extending KernelBench to other DSLs beyond `cuda` as well. + ## ⚖️ Evaluation #### Methodology To evaluate model-generated kernels, we need to check if they: @@ -47,6 +49,7 @@ Some examples to illustrate this metric that filters based on speedups: You can increase speedup threshold `p` to make the task more challenging. + #### Compute Overall Benchmark Performance We provide a script `scripts/greedy_analysis.py` to compute the overall benchmark performance. @@ -95,6 +98,8 @@ python3 scripts/generate_and_eval_single_sample.py dataset_src="huggingface" lev # add .verbose_logging for more visbility ``` +We are also supporting other GPU programming languages beyond `cuda`. Simply specify `backend=triton`. For now we support (`cuda`, `triton`, `cute`). + ### Run on all problems ```