Merged
34 commits
ba9ff92  Add radix-base selection (ClementLinCF, Dec 5, 2025)
7aea9bf  Remove explicit template (ClementLinCF, Dec 5, 2025)
fa16c95  Update the selected k condition (ClementLinCF, Dec 5, 2025)
b138e5f  remove pos < k guard (ClementLinCF, Dec 6, 2025)
65f53b6  code format (ClementLinCF, Dec 6, 2025)
d8b8d2d  Update csrc/include/rocm_ops.hpp (ClementLinCF, Dec 8, 2025)
c34deed  Update csrc/kernels/topk_per_row_kernels.cu (ClementLinCF, Dec 8, 2025)
afe3ff6  Update csrc/kernels/topk_plain_kernels.cu (ClementLinCF, Dec 8, 2025)
c47deb4  Update test_topk_plain.py (ClementLinCF, Dec 8, 2025)
c361340  Update TODO message (ClementLinCF, Dec 8, 2025)
1946eb1  Update csrc/kernels/topk_per_row_kernels.cu (ClementLinCF, Dec 8, 2025)
5046dd3  Update op_tests/test_topk_plain.py (ClementLinCF, Dec 8, 2025)
f0ad619  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 8, 2025)
2e7791c  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 9, 2025)
d3cfb82  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 10, 2025)
090a0cd  format test_topk_plain.py with black (ClementLinCF, Dec 10, 2025)
e525ded  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 10, 2025)
76df2d2  Merge branch 'main' into adaptive_topk (valarLip, Dec 10, 2025)
cda1276  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 11, 2025)
f61c11a  Disable triton test for a reasonable execution time (ClementLinCF, Dec 11, 2025)
88fe65a  add explicit template instantiation (ClementLinCF, Dec 12, 2025)
890298d  fix explicit template instantiation (ClementLinCF, Dec 12, 2025)
6ca176e  add explicit template instantiation (ClementLinCF, Dec 12, 2025)
ad8d1d2  Merge branch 'main' into adaptive_topk (ClementLinCF, Dec 12, 2025)
4550051  Add bf16 support (MHYangAMD, Dec 15, 2025)
734643c  Merge branch 'main' into adaptive_topk (MHYangAMD, Dec 16, 2025)
9bf966b  Fix linter (MHYangAMD, Dec 16, 2025)
3d4ec2e  Fix build errors (MHYangAMD, Dec 16, 2025)
aa20858  Fix condition (MHYangAMD, Dec 17, 2025)
f68a8f5  Fix build and test (MHYangAMD, Dec 17, 2025)
f5c82b3  Merge branch 'main' into adaptive_topk (MHYangAMD, Dec 17, 2025)
a827f81  Update conditions (MHYangAMD, Dec 17, 2025)
0c09f49  Merge branch 'main' into adaptive_topk (MHYangAMD, Dec 17, 2025)
18bb2d1  Merge branch 'main' into adaptive_topk (valarLip, Dec 21, 2025)
aiter/jit/optCompilerConfig.json (3 changes: 2 additions & 1 deletion)

@@ -1077,7 +1077,8 @@
         "module_topk_plain": {
             "srcs": [
                 "f'{AITER_CSRC_DIR}/pybind/topk_plain_pybind.cu'",
-                "f'{AITER_CSRC_DIR}/kernels/topk_plain_kernels.cu'"
+                "f'{AITER_CSRC_DIR}/kernels/topk_plain_kernels.cu'",
+                "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'"
             ],
             "flags_extra_cc": [],
             "flags_extra_hip": [],
aiter/ops/topk_plain.py (7 changes: 6 additions & 1 deletion)

@@ -13,7 +13,12 @@
 def topk_plain(
     x: torch.Tensor,
     topk_ids: torch.Tensor,
+    topk_out: torch.Tensor,
     topk: int,
-    largest: bool,
+    largest: bool = True,
+    rowStarts: torch.Tensor = None,
+    rowEnds: torch.Tensor = None,
+    stride0: int = -1,
+    stride1: int = 1,
 ) -> None:
     pass
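The new Python entry point takes a preallocated output buffer and optional per-row bounds. As a quick illustration of how a caller might use it, here is a minimal sketch; the import path matches this diff, but the int32 index dtype, the preallocation requirement, and the bounds semantics are assumptions read off the kernel changes below, not documented API:

import torch
from aiter.ops.topk_plain import topk_plain

x = torch.randn(8, 1024, device="cuda", dtype=torch.float32)
k = 16
topk_ids = torch.empty(8, k, device="cuda", dtype=torch.int32)  # dtype assumed
topk_out = torch.empty(8, k, device="cuda", dtype=x.dtype)

# Full-row top-k; largest now defaults to True.
topk_plain(x, topk_ids, topk_out, k)

# Optional per-row windows: row i is presumably searched over
# [rowStarts[i], rowEnds[i]), based on the kernel changes in this PR.
rowStarts = torch.zeros(8, device="cuda", dtype=torch.int32)
rowEnds = torch.full((8,), 512, device="cuda", dtype=torch.int32)
topk_plain(x, topk_ids, topk_out, k, True, rowStarts, rowEnds)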
csrc/include/opus/opus.hpp (2 changes: 1 addition & 1 deletion)

@@ -907,7 +907,7 @@ template<> OPUS_D float min<float>(const float&a, const float&b) { return

 template<typename T> OPUS_D T med3(const T&a, const T&b, const T&c) { auto max_0 = max(a, b); auto min_0 = max(a, b); return max(max_0, max(min_0, c)); }
 template<> OPUS_D float med3<float>(const float&a, const float&b, const float&c) { return __builtin_amdgcn_fmed3f(a, b, c); }
-template<> OPUS_D __fp16 med3<__fp16>(const __fp16&a, const __fp16&b, const __fp16&c) { return __builtin_amdgcn_fmed3h(a, b, c); }
+template<> OPUS_D _Float16 med3<_Float16>(const _Float16&a, const _Float16&b, const _Float16&c) { return __builtin_amdgcn_fmed3h(a, b, c); }
 /////////////////////////////////////////////////////////////////////////////////////////////////////////
 // buffer load/store related
 OPUS_D constexpr auto buffer_default_config() {
csrc/include/rocm_ops.hpp (19 changes: 12 additions & 7 deletions)

@@ -1635,10 +1635,15 @@ namespace py = pybind11;
           py::arg("final_output"), \
           py::arg("final_lse") = std::nullopt);

-#define TOPK_PLAIN_PYBIND      \
-    m.def("topk_plain",        \
-          &topk_plain,         \
-          py::arg("values"),   \
-          py::arg("topk_ids"), \
-          py::arg("topk"),     \
-          py::arg("largest"));
+#define TOPK_PLAIN_PYBIND                         \
+    m.def("topk_plain",                           \
+          &topk_plain,                            \
+          py::arg("values"),                      \
+          py::arg("topk_ids"),                    \
+          py::arg("topk_out"),                    \
+          py::arg("topk"),                        \
+          py::arg("largest") = true,              \
+          py::arg("rowStarts") = torch::Tensor(), \
+          py::arg("rowEnds") = torch::Tensor(),   \
+          py::arg("stride0") = -1,                \
+          py::arg("stride1") = 1);
csrc/include/topk_plain.h (9 changes: 7 additions & 2 deletions)

@@ -6,5 +6,10 @@

 void topk_plain(torch::Tensor& values,
                 torch::Tensor& topk_ids,
-                int topk_num,
-                bool largest);
+                torch::Tensor& topk_out,
+                int topk,
+                bool largest = true,
+                torch::Tensor rowStarts = torch::Tensor(),
+                torch::Tensor rowEnds = torch::Tensor(),
+                int64_t stride0 = -1,
+                int64_t stride1 = 1);
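For reference, the intended semantics of the optional bounds can be written out in plain PyTorch. This is a hypothetical reference model, not code from the PR; in particular, whether the returned indices are absolute columns or relative to rowStarts[i] is an assumption (absolute is used here):

import torch

def topk_plain_reference(x, k, largest=True, rowStarts=None, rowEnds=None):
    # Hypothetical reference: row i is reduced over columns [lo, hi) only.
    out_vals = torch.empty(x.size(0), k, dtype=x.dtype)
    out_ids = torch.empty(x.size(0), k, dtype=torch.long)
    for i in range(x.size(0)):
        lo = int(rowStarts[i]) if rowStarts is not None else 0
        hi = int(rowEnds[i]) if rowEnds is not None else x.size(1)
        vals, ids = torch.topk(x[i, lo:hi], k, largest=largest)
        out_vals[i] = vals
        out_ids[i] = ids + lo  # shift back to absolute column indices (assumed)
    return out_vals, out_ids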
csrc/kernels/topk_per_row_kernels.cu (84 changes: 72 additions & 12 deletions)

@@ -420,7 +420,8 @@ __device__ void filter_and_histogram(T const* in_buf,
                                      IdxT* histogram,
                                      bool select_min,
                                      int pass,
-                                     bool early_stop)
+                                     bool early_stop,
+                                     IdxT k)
 {
     constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
     __shared__ IdxT histogram_smem[num_buckets];
@@ -893,9 +894,19 @@ __global__ void radix_kernel(T const* in,
                              int const pass)
 {
     const int64_t batch_id = blockIdx.y;
-    const IdxT row_len = phase == Phase::Prefill
-                             ? rowEnds[batch_id] - rowStarts[batch_id]
-                             : rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1;
+
+    IdxT row_len = len;
+    if(phase == Phase::Prefill)
+    {
+        if(rowStarts && rowEnds)
+        {
+            row_len = rowEnds[batch_id] - rowStarts[batch_id];
+        }
+    }
+    else
+    {
+        row_len = rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1;
+    }

     auto counter = counters + batch_id;
     IdxT current_k;
@@ -965,7 +976,8 @@
                              histogram,
                              select_min,
                              pass,
-                             early_stop);
+                             early_stop,
+                             k);
     __threadfence();

     bool isLastBlock = false;
@@ -1187,7 +1199,8 @@ __device__ bool filter_and_histogram_for_one_block(T const* in_buf,
                                                    Counter<T, IdxT>* counter,
                                                    IdxT* histogram,
                                                    bool select_min,
-                                                   int pass)
+                                                   int pass,
+                                                   IdxT k)
 {
     constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
     for(int i = threadIdx.x; i < num_buckets * 2; i += blockDim.x)
@@ -1371,11 +1384,25 @@ __global__ void radix_topk_one_block_kernel(T const* in,
     __shared__ IdxT histogram[num_buckets * 2];

     const int64_t batch_id = blockIdx.x;
-    const IdxT rowStart = phase == Phase::Prefill ? rowStarts[batch_id] : 0;
-    const IdxT rowEnd = phase == Phase::Prefill
-                            ? rowEnds[batch_id]
-                            : rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1;
-    const IdxT row_len = rowEnd - rowStart;
+
+    IdxT rowStart = 0;
+    IdxT rowEnd = len;
+    if(phase == Phase::Prefill)
+    {
+        if(rowStarts && rowEnds)
+        {
+            rowStart = rowStarts[batch_id];
+            rowEnd = rowEnds[batch_id];
+        }
+    }
+    else
+    {
+        rowEnd = rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1;
+        rowStart = 0;
+    }
+
+    const IdxT row_len = rowEnd - rowStart;

     if(threadIdx.x == 0)
     {
         counter.k = k;
@@ -1448,7 +1475,8 @@
                                                         &counter,
                                                         histogram,
                                                         select_min,
-                                                        pass); //@TODO CHECK UPDATE CODE
+                                                        pass,
+                                                        k); //@TODO CHECK UPDATE CODE
     __syncthreads();

     scan<IdxT, BitsPerPass, BlockSize>(histogram + use_one_pass * num_buckets);
@@ -1811,6 +1839,35 @@ void standalone_stable_radix_11bits(void* buf,
     }
 }

+// Explicit template instantiation for standalone_stable_radix_11bits
+template void standalone_stable_radix_11bits<float, int, true, true>(void* buf,
+                                                                     size_t& buf_size,
+                                                                     float const* in,
+                                                                     int batch_size,
+                                                                     int64_t len,
+                                                                     int* rowStarts,
+                                                                     int* rowEnds,
+                                                                     int k,
+                                                                     float* out,
+                                                                     int* out_idx,
+                                                                     bool greater,
+                                                                     hipStream_t stream,
+                                                                     int next_n);
+
+template void standalone_stable_radix_11bits<float, int, false, true>(void* buf,
+                                                                      size_t& buf_size,
+                                                                      float const* in,
+                                                                      int batch_size,
+                                                                      int64_t len,
+                                                                      int* rowStarts,
+                                                                      int* rowEnds,
+                                                                      int k,
+                                                                      float* out,
+                                                                      int* out_idx,
+                                                                      bool greater,
+                                                                      hipStream_t stream,
+                                                                      int next_n);
+
 // AIR TopK end

 static inline __device__ uint32_t floatAsSortableUint(float x)
@@ -2410,6 +2467,9 @@ int64_t invokeComputeTopkLastDimWorkspaceSize(int32_t numRows, int32_t stride0)
     return buf_size;
 }

+// Explicit template instantiation to ensure the symbol is available for linking
+template int64_t invokeComputeTopkLastDimWorkspaceSize<float>(int32_t numRows, int32_t stride0);
+
 void top_k_per_row_prefill(const torch::Tensor& logits,
                            const torch::Tensor& rowStarts,
                            const torch::Tensor& rowEnds,
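One detail worth spelling out from the kernel changes above: in the non-prefill (decode) phase, the effective row length is derived per batch row as rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1, so consecutive rows within a next_n block see progressively longer prefixes (presumably for multi-token or speculative decoding; that reading is inferred, not stated in the diff). A small Python sketch of just that arithmetic:

# Mirrors the kernel's decode-phase length expression; names follow the kernel,
# the surrounding setup is assumed for illustration only.
def decode_row_len(rowEnds, batch_id, next_n):
    seq = batch_id // next_n   # which sequence this row belongs to
    pos = batch_id % next_n    # position within the next_n block
    return rowEnds[seq] - next_n + pos + 1

# Example: one sequence with rowEnds = [10] and next_n = 4 gives its four
# decode rows effective lengths [7, 8, 9, 10].
print([decode_row_len([10], b, 4) for b in range(4)])  # [7, 8, 9, 10]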