Merged

Changes from all commits
41 commits
f2881a0
fix sink error for asm fmha (#1652)
LJ-underdog Dec 17, 2025
f2d1627
add guard in case pynccl init failed (#1671)
valarLip Dec 17, 2025
999ebcd
One shot pa (#1670)
fsx950223 Dec 18, 2025
3d84d01
fix(pa_ps): fix pa_ps_asm .co for gfx950 (#1669)
dbyoung18 Dec 18, 2025
78994e0
modify test_bf16gemm_test (#1678)
amd-ruitang3 Dec 18, 2025
290e659
Fix Ruff command in pre-checks (#1675)
Boss2002n Dec 18, 2025
235cfa6
fix mha bwd golden perf issue (#1666)
JaxChen29 Dec 18, 2025
b1278bd
topk uplift v1 (#1662)
steamedMantou Dec 19, 2025
d3a4a0f
fix missing return in mha_bwd (#1688)
yuguo68 Dec 19, 2025
97e760c
Remove the input parameter "out" in gemm_a4w4 (#1679)
junhaha666 Dec 19, 2025
6f20772
fwd v3 hd192 optimize inst alignment for causal mode (#1663)
shay-li77 Dec 19, 2025
7df7b36
fix swa case mismatch (#1694)
JaxChen29 Dec 19, 2025
1e7630f
fixing the fp4 gemm tune script Exception caused by tile_m name incon…
hongxiayang Dec 19, 2025
126d28f
CI: Migrate Triton tests to aiter-1gpu-runner (#1690)
gyohuangxin Dec 19, 2025
1127ab4
add ntile 128 for a8 blkQ moe 1 stage (#1695)
zufayu Dec 19, 2025
9032211
Optimize RoPE in the cases that hdim is small. (#1698)
ruanjm Dec 19, 2025
7f8ed6a
rm garbage from whl (#1696)
amd-ruitang3 Dec 19, 2025
70562e8
enhance prebuild logic (#1672)
zufayu Dec 19, 2025
bf02586
LLfp4 qr cap for atom (#1673)
amirumoAMD Dec 19, 2025
3ad1c76
[MLA] MLA conditions rewrite (#1665)
Zzz9990 Dec 20, 2025
84b2b2f
fix dp causal (#1677)
Zzz9990 Dec 20, 2025
723467d
add two fp4 tune shapes and tuned config (#1687)
hongxiayang Dec 20, 2025
48ee8cc
Dev/a8w4 and a8w8splitk (#1667)
yadaish Dec 20, 2025
c6965e6
bf16_gemm_clean_in_kl (#1700)
amd-ruitang3 Dec 20, 2025
a9d356f
fix tuner (#1701)
valarLip Dec 21, 2025
ac6142e
add gen_fake for 4 gemm operators (#1456)
mqhc2020 Dec 21, 2025
420f5da
fix llvm issue (#1703)
valarLip Dec 21, 2025
f174268
feat: Adaptive topk algorithm selection based on input characteristic…
ClementLinCF Dec 22, 2025
463e8e7
fix mha bwd build error (#1705)
JaxChen29 Dec 22, 2025
94c5b98
fix moe bug when pipever=v1 and nblk=64 (#1707)
lalala-sh Dec 22, 2025
2d71438
fix (#1710)
valarLip Dec 22, 2025
14d92a0
[PA] Optimize PA Decode Gluon Performance for BF16/FP16 with KV_BLOCK…
yanguahe Dec 23, 2025
707b9fc
Fix argument parsing logic when AITER_JIT_DIR is set (#1715)
omoisis-dn Dec 24, 2025
c17b074
fix topk deocde bug in logit value is same (#1716)
steamedMantou Dec 24, 2025
5c11567
add fp32 input (#1706)
zufayu Dec 24, 2025
455bc5d
add sampling aot (#1711)
fsx950223 Dec 24, 2025
f9ac657
Merge branch 'main' into wjx/ck_tile_moe
Zzz9990 Dec 25, 2025
73673b6
update
Zzz9990 Dec 25, 2025
c5aa662
bugfix
Zzz9990 Dec 25, 2025
2210c48
update
lalala-sh Dec 25, 2025
dcdf9ef
update
Zzz9990 Dec 25, 2025
13 changes: 11 additions & 2 deletions .github/workflows/pre-checks.yaml
@@ -35,7 +35,7 @@ jobs:
       - name: Checkout code
         uses: actions/checkout@v4
       - name: Set up Python environment
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v4
         with:
           python-version: "3.12"
       - name: Install dependencies
@@ -46,7 +46,16 @@
       env:
         REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       run: |
-        ruff check . -e | reviewdog -efm="%f:%l:%c: %m" -diff="git diff FETCH_HEAD" -reporter=github-pr-check -tee
+        ruff check . \
+          --output-format=rdjson \
+          --exit-zero \
+          --no-fix \
+          | reviewdog \
+          -f=rdjson \
+          -name="ruff" \
+          -reporter=github-pr-review \
+          -filter-mode=diff_context \
+          -fail-on-error=true

   upload-success-artifact:
     name: Upload Success Signal
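The rewritten lint step has ruff emit rdjson, which reviewdog parses natively, replacing the old errorformat pipeline; --exit-zero makes ruff always exit 0 so that pass/fail is decided by reviewdog's -fail-on-error=true. To inspect the same diagnostics locally without reviewdog, a minimal sketch (the helper script is hypothetical; it assumes ruff's rdjson output follows the Reviewdog Diagnostic JSON schema):

# local_ruff_check.py (hypothetical): mirrors the CI lint step without reviewdog.
import json
import subprocess

result = subprocess.run(
    ["ruff", "check", ".", "--output-format=rdjson", "--exit-zero", "--no-fix"],
    capture_output=True,
    text=True,
    check=True,  # --exit-zero: ruff exits 0 even when it finds violations
)
for diag in json.loads(result.stdout).get("diagnostics", []):
    start = diag["location"]["range"]["start"]
    print(f"{diag['location']['path']}:{start['line']}:{start['column']}: {diag['message']}")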
2 changes: 1 addition & 1 deletion .github/workflows/triton-test.yaml
@@ -29,7 +29,7 @@ jobs:
       GITHUB_SHA: ${{ github.sha }}

   triton:
-    runs-on: aiter-mi300-1gpu
+    runs-on: aiter-1gpu-runner
     needs: [check-signal]
     env:
       DOCKER_IMAGE: "rocm/pytorch:latest"
2 changes: 1 addition & 1 deletion 3rdparty/composable_kernel
Submodule composable_kernel updated 400 files
6 changes: 5 additions & 1 deletion MANIFEST.in
@@ -1,2 +1,6 @@
 graft aiter
-graft aiter_meta
+graft aiter_meta
+
+# exclude cache and compiled files .pyc / .pyo / .pyd
+global-exclude *.py[cod]
+prune aiter/jit/build
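These MANIFEST.in rules pair with the "rm garbage from whl (#1696)" commit above: global-exclude *.py[cod] drops compiled bytecode (.pyc/.pyo/.pyd) from source distributions, and prune aiter/jit/build drops JIT build artifacts. A minimal spot-check sketch, assuming a placeholder archive path for a freshly built sdist:

# check_sdist.py (hypothetical): verify no bytecode or JIT build junk ships.
import tarfile

SDIST = "dist/aiter-0.0.0.tar.gz"  # placeholder; point at your dist/ output

with tarfile.open(SDIST, "r:gz") as sdist:
    leftovers = [
        name
        for name in sdist.getnames()
        if name.endswith((".pyc", ".pyo", ".pyd")) or "/jit/build/" in name
    ]
print("leftover junk:", leftovers or "none")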
89 changes: 89 additions & 0 deletions aiter/aot/sampling.py
@@ -0,0 +1,89 @@
from collections import namedtuple
import os
import concurrent.futures
from csrc.cpp_itfs.sampling.top_k_renorm_probs import (
compile as top_k_renorm_probs_compile,
)
from csrc.cpp_itfs.sampling.top_p_sampling_from_probs import (
compile as top_p_sampling_from_probs_compile,
)
from csrc.cpp_itfs.sampling.top_k_top_p_sampling_from_probs import (
compile as top_k_top_p_sampling_from_probs_compile,
)

TopKRenormConfig = namedtuple(
"TopKRenormConfig",
["vec_size", "func_name"],
)

TopPSamplingConfig = namedtuple(
"TopPSamplingConfig",
["vec_size", "deterministic", "func_name"],
)

TopKTopPSamplingConfig = namedtuple(
"TopKTopPSamplingConfig",
["vec_size", "deterministic", "func_name"],
)


def process_top_k_renorm_config(config):
return top_k_renorm_probs_compile(config.vec_size)


def process_top_p_sampling_config(config):
return top_p_sampling_from_probs_compile(config.vec_size, config.deterministic)


def process_top_k_top_p_sampling_config(config):
return top_k_top_p_sampling_from_probs_compile(
config.vec_size, config.deterministic
)


def main():
# Generate configs for top_k_renorm_probs
top_k_renorm_configs = []
for vec_size in range(1, 5):
top_k_renorm_configs.append(
TopKRenormConfig(
vec_size=vec_size,
func_name="top_k_renorm_probs",
)
)

# Generate configs for top_p_sampling_from_probs
top_p_sampling_configs = []
for vec_size in range(1, 5):
for deterministic in [False, True]:
top_p_sampling_configs.append(
TopPSamplingConfig(
vec_size=vec_size,
deterministic=deterministic,
func_name="top_p_sampling_from_probs",
)
)

# Generate configs for top_k_top_p_sampling_from_probs
top_k_top_p_sampling_configs = []
for vec_size in range(1, 5):
for deterministic in [False, True]:
top_k_top_p_sampling_configs.append(
TopKTopPSamplingConfig(
vec_size=vec_size,
deterministic=deterministic,
func_name="top_k_top_p_sampling_from_probs",
)
)

max_jobs = int(os.environ.get("MAX_JOBS", os.cpu_count() or 16))

# Process all configs in parallel
with concurrent.futures.ProcessPoolExecutor(max_workers=max_jobs) as executor:
executor.map(process_top_k_renorm_config, top_k_renorm_configs)
executor.map(process_top_p_sampling_config, top_p_sampling_configs)
executor.map(process_top_k_top_p_sampling_config, top_k_top_p_sampling_configs)


if __name__ == "__main__":
main()
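One caveat about the new AOT script: ProcessPoolExecutor.map schedules its work eagerly, but an exception raised inside a worker is re-raised only when the returned iterator is consumed, so as written a failed compile passes silently. A sketch (not part of the PR) of the final block of main() that surfaces such failures:

    # Consuming each map iterator re-raises any worker exception here
    # instead of leaving it unread inside a discarded future.
    with concurrent.futures.ProcessPoolExecutor(max_workers=max_jobs) as executor:
        for results in (
            executor.map(process_top_k_renorm_config, top_k_renorm_configs),
            executor.map(process_top_p_sampling_config, top_p_sampling_configs),
            executor.map(
                process_top_k_top_p_sampling_config, top_k_top_p_sampling_configs
            ),
        ):
            list(results)  # raises if any compile worker failed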
2 changes: 2 additions & 0 deletions aiter/configs/a4w4_blockscale_tuned_gemm.csv
@@ -921,3 +921,5 @@ cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio
 256,8,3072,1536,42,0,5.4682,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,13.81,441.57,0.0
 256,8,7168,2048,29,0,5.836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.25,1278.77,0.0
 256,8,512,7168,29,0,9.6677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.07,193.62,0.0
+256,32768,2112,7168,48,0,293.0219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3385.88,898.98,0.0
+256,65536,2112,7168,48,0,575.6528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3447.0,902.06,0.0
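As a sanity check, the tflops column in these rows is consistent with 2*M*N*K floating-point operations over the measured kernel time; a short sketch using the first added row (only the formula is illustrative):

# Values copied from the 32768x2112x7168 row added above.
M, N, K, us = 32768, 2112, 7168, 293.0219

flops = 2 * M * N * K               # one multiply-accumulate = 2 FLOPs
tflops = flops / (us * 1e-6) / 1e12
print(f"{tflops:.2f} TFLOPS")       # ~3385.88, matching the CSV entry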
2 changes: 2 additions & 0 deletions aiter/configs/a4w4_blockscale_untuned_gemm.csv
@@ -193,3 +193,5 @@ M,N,K
 3000, 7168, 2048
 3000, 512, 7168
 60000, 4096, 512
+32768, 2112, 7168
+65536, 2112, 7168