From e5caca340b805011aedb6e575368288f6f4af79e Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Mon, 1 Dec 2025 16:23:04 -0800 Subject: [PATCH 1/2] Added TK backend and mounting for modal support. Need to copy thunderkittens repo to the root! --- .gitignore | Bin 171 -> 197 bytes scripts/eval_from_generations.py | 42 ++++-- scripts/generate_and_eval_single_sample.py | 6 +- .../generate_and_eval_single_sample_modal.py | 45 ++++-- scripts/generate_samples.py | 4 +- .../model_new_ex_add_thunderkittens.py | 142 ++++++++++++++++++ src/prompts/prompts.toml | 5 + 7 files changed, 218 insertions(+), 26 deletions(-) create mode 100644 src/prompts/model_new_ex_add_thunderkittens.py diff --git a/.gitignore b/.gitignore index abb125d6aabe372688772976daa7ee024353eaf8..bc05f65e5cb6bf7ff3a0357d0df20910f2bb149b 100644 GIT binary patch delta 33 ocmZ3@c$9I%YAHRJ;P~K@{GwE@kc`s2l++^c%#xDSykdPW0MA4V_y7O^ delta 6 NcmX@gxSDaoY5)l%0 template struct. +# 3. Create an alias like: using warp = kittens::group<1>; +# 4. Then call: warp::load(...), warp::zero(...), etc. +# +elementwise_add_cuda_source = """ +// IMPORTANT: Define KITTENS_HOPPER before including ThunderKittens headers for H100/Hopper GPUs +// This enables FP8 types and Hopper-specific features +#define KITTENS_HOPPER + +#include +#include + +// Include ThunderKittens headers +#include "kittens.cuh" + +// ThunderKittens namespace and group aliases +// Operations are accessed through these group types, NOT as free functions +using namespace kittens; +using warp = kittens::group<1>; // For single-warp operations (32 threads) +// For multi-warp operations, use: using warpgroup = kittens::group<4>; + +// Constants for tile dimensions +constexpr int TILE_DIM = 16; + +// ThunderKittens elementwise add kernel using shared memory tiles +// This example demonstrates the ThunderKittens API pattern +__global__ void tk_elementwise_add_kernel(const float* __restrict__ a_ptr, + const float* __restrict__ b_ptr, + float* __restrict__ out_ptr, + int rows, int cols) { + // For simple element-wise ops, we use a straightforward approach + // ThunderKittens shines for matrix ops with tiles, but here we show basic pattern + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int total = rows * cols; + + // Grid-stride loop for simple element-wise addition + for (int i = idx; i < total; i += blockDim.x * gridDim.x) { + out_ptr[i] = a_ptr[i] + b_ptr[i]; + } +} + +// Alternative: ThunderKittens tiled version for larger matrices +// Shows proper usage of ThunderKittens tile types and group operations +// Uncomment and adapt for matrix operations: +/* +__global__ void tk_matmul_kernel(const bf16* A, const bf16* B, bf16* C, + int M, int N, int K) { + // Define aliases for the group - THIS IS REQUIRED for ThunderKittens ops + using warpgroup = kittens::group<4>; // 4 warps = 128 threads + + // ThunderKittens register tiles for accumulation + rt_fl<16, 16> acc; // 16x16 float register tile + + // Shared memory tiles + extern __shared__ alignment_dummy __shm[]; + st_bf<16, 16> (&a_smem)[2] = *reinterpret_cast(*)[2]>(__shm); + st_bf<16, 16> (&b_smem)[2] = *reinterpret_cast(*)[2]>(__shm + sizeof(st_bf<16,16>)*2); + + // Initialize accumulator to zero - NOTE: use warpgroup:: prefix! + warpgroup::zero(acc); + + // Main loop would go here with: + // warpgroup::load(a_smem[...], ...); // Load from global to shared + // warpgroup::mma_AB(acc, a_tile, b_tile); // Matrix multiply-accumulate + // warpgroup::store(C_ptr, acc, ...); // Store result +} +*/ + +torch::Tensor elementwise_add_cuda(torch::Tensor a, torch::Tensor b) { + TORCH_CHECK(a.is_cuda(), "Input tensor a must be on CUDA"); + TORCH_CHECK(b.is_cuda(), "Input tensor b must be on CUDA"); + TORCH_CHECK(a.sizes() == b.sizes(), "Input tensors must have the same shape"); + + auto out = torch::empty_like(a); + int rows = a.size(0); + int cols = a.numel() / rows; + + const int block_size = 256; + const int num_blocks = (a.numel() + block_size - 1) / block_size; + + tk_elementwise_add_kernel<<>>( + a.data_ptr(), + b.data_ptr(), + out.data_ptr(), + rows, cols + ); + + return out; +} +""" + +# Compile the ThunderKittens kernel inline +elementwise_add = load_inline( + name="elementwise_add_tk", + cpp_sources=elementwise_add_cpp_source, + cuda_sources=elementwise_add_cuda_source, + functions=["elementwise_add_cuda"], + verbose=True, + extra_include_paths=[ + TK_PATH, + os.path.join(TK_PATH, "include"), + ], + extra_cflags=["-std=c++20", "-O3"], + extra_cuda_cflags=[ + "-std=c++20", + "-O3", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "-Xcompiler", "-fPIC", + "-DNDEBUG", + "-DKITTENS_HOPPER", + ], +) + + +class ModelNew(nn.Module): + def __init__(self) -> None: + super().__init__() + self.elementwise_add = elementwise_add + + def forward(self, a, b): + return self.elementwise_add.elementwise_add_cuda(a, b) diff --git a/src/prompts/prompts.toml b/src/prompts/prompts.toml index bcf4e4ed..1d0c1fed 100644 --- a/src/prompts/prompts.toml +++ b/src/prompts/prompts.toml @@ -49,6 +49,11 @@ backend_display = "TileLang kernels" one_shot_new_arch = "src/prompts/model_new_ex_add_tilelang.py" # No few_shot_examples - will use one-shot when few_shot option is selected +[backends.thunderkittens] +backend_display = "ThunderKittens kernels" +one_shot_new_arch = "src/prompts/model_new_ex_add_thunderkittens.py" +# No few_shot_examples - will use one-shot when few_shot option is selected + # ------------------------------------------------------------------------- # Precision: Precision-specific configuration # ------------------------------------------------------------------------- From 56c977a4e78fc43557717fcd6a1bb75e44fdabc8 Mon Sep 17 00:00:00 2001 From: Willy-Chan Date: Sat, 6 Dec 2025 14:48:40 -0800 Subject: [PATCH 2/2] run and check works with TK --- scripts/run_and_check.py | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/scripts/run_and_check.py b/scripts/run_and_check.py index 316b96ee..04a61d57 100644 --- a/scripts/run_and_check.py +++ b/scripts/run_and_check.py @@ -26,20 +26,44 @@ REPO_TOP_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) KERNEL_BENCH_PATH = os.path.join(REPO_TOP_PATH, "KernelBench") +THUNDERKITTENS_LOCAL_PATH = os.path.join(REPO_TOP_PATH, "ThunderKittens") +SRC_PATH = os.path.join(REPO_TOP_PATH, "src") cuda_version = "12.8.0" flavor = "devel" operating_sys = "ubuntu22.04" tag = f"{cuda_version}-{flavor}-{operating_sys}" -image = ( - modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") - .apt_install("git", "gcc-10", "g++-10", "clang") - .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) - .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") - .add_local_python_source("src") - .add_local_python_source("scripts") -) +# ThunderKittens support - use TK image if directory exists locally +if os.path.isdir(THUNDERKITTENS_LOCAL_PATH): + # ThunderKittens image with TK environment and mounting + image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") + .apt_install("git", "gcc-10", "g++-10", "clang") + .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .env({ + "THUNDERKITTENS_ROOT": "/root/ThunderKittens", + "THUNDERKITTENS_PATH": "/root/ThunderKittens", + "TORCH_CUDA_ARCH_LIST": "9.0", + "CXX": "g++-10", + "CC": "gcc-10", + }) + .add_local_dir(THUNDERKITTENS_LOCAL_PATH, remote_path="/root/ThunderKittens", copy=True) + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") + .add_local_dir(SRC_PATH, remote_path="/root/src") + .add_local_python_source("src") + .add_local_python_source("scripts") + ) +else: + # Standard image without ThunderKittens + image = ( + modal.Image.from_registry(f"nvidia/cuda:{tag}", add_python="3.10") + .apt_install("git", "gcc-10", "g++-10", "clang") + .pip_install_from_requirements(os.path.join(REPO_TOP_PATH, "requirements.txt")) + .add_local_dir(KERNEL_BENCH_PATH, remote_path="/root/KernelBench") + .add_local_python_source("src") + .add_local_python_source("scripts") + ) """ Run a pair of KernelBench format (problem, solution) to check if solution is correct and compute speedup