From f2881a0f43a9b46db5abc8c97db191460ad909dd Mon Sep 17 00:00:00 2001 From: Linjun-AMD Date: Wed, 17 Dec 2025 16:28:15 +0800 Subject: [PATCH 01/40] fix sink error for asm fmha (#1652) Signed-off-by: Linjun-AMD --- csrc/py_itfs_cu/asm_mha_fwd.cu | 1 + csrc/py_itfs_cu/asm_mha_varlen_fwd.cu | 1 + 2 files changed, 2 insertions(+) diff --git a/csrc/py_itfs_cu/asm_mha_fwd.cu b/csrc/py_itfs_cu/asm_mha_fwd.cu index d4c901fd2a..2efc1d7828 100644 --- a/csrc/py_itfs_cu/asm_mha_fwd.cu +++ b/csrc/py_itfs_cu/asm_mha_fwd.cu @@ -321,6 +321,7 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] has_lse, quant_scale_enum::no_scale, true, + false, how_v3_bf16_cvt); TORCH_CHECK(t >= 0, "invalid argument for fmha_fwd"); } diff --git a/csrc/py_itfs_cu/asm_mha_varlen_fwd.cu b/csrc/py_itfs_cu/asm_mha_varlen_fwd.cu index 0189bd267c..6d2e9b8b19 100644 --- a/csrc/py_itfs_cu/asm_mha_varlen_fwd.cu +++ b/csrc/py_itfs_cu/asm_mha_varlen_fwd.cu @@ -399,6 +399,7 @@ fmha_v3_varlen_fwd(at::Tensor &q, // [total_q, hq, d] has_lse, quant_scale_enum::no_scale, true, + false, how_v3_bf16_cvt); TORCH_CHECK(t >= 0, "invalid argument for fmha_v3_varlen_fwd 3"); } From f2d16272cc618c404d83a4355ee17ba81450e9da Mon Sep 17 00:00:00 2001 From: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> Date: Wed, 17 Dec 2025 22:47:41 +0800 Subject: [PATCH 02/40] add guard in case pynccl init failed (#1671) --- .../dist/device_communicators/communicator_cuda.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index fcc7ee05b2..60ff59ca57 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -49,10 +49,16 @@ def __init__( PyNcclCommunicator, ) - self.pynccl_comm = PyNcclCommunicator( - group=self.cpu_group, - device=self.device, - ) + try: + self.pynccl_comm = PyNcclCommunicator( + 
group=self.cpu_group, + device=self.device, + ) + except Exception as e: + logger.warning( + f"Failed to initialize PyNcclCommunicator for group " + f"{self.unique_name}. Exception: {e}" + ) # if is_symmetric_memory_enabled(): # register_nccl_symmetric_ops(self.pynccl_comm) From 999ebcdc0573d9e9e9b4e71bdb86b2521a537582 Mon Sep 17 00:00:00 2001 From: who who who Date: Thu, 18 Dec 2025 08:41:48 +0800 Subject: [PATCH 03/40] One shot pa (#1670) * add one shot pa kernel * fix buffer load in sliding window kernel * fix typo * revert --------- Co-authored-by: root --- aiter/ops/triton/gluon/pa_decode_gluon.py | 57 +++++++++++------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/aiter/ops/triton/gluon/pa_decode_gluon.py b/aiter/ops/triton/gluon/pa_decode_gluon.py index 37e7ddd268..9c07d4c4ac 100644 --- a/aiter/ops/triton/gluon/pa_decode_gluon.py +++ b/aiter/ops/triton/gluon/pa_decode_gluon.py @@ -1463,6 +1463,7 @@ def paged_attention_decode_sliding_window( * stride_output_head + output_head_size_offsets[None, :] ) + max_logits = gl.full( (QUERY_GROUP_SIZE_POW2,), float("-inf"), @@ -1481,12 +1482,15 @@ def paged_attention_decode_sliding_window( # ==================== SEQUENCE PROCESSING ==================== query_converted = query_shared.load(qk_lhs_operand_layout) - # query_converted = gl.convert_layout(query_tensor, layout=qk_lhs_operand_layout) - sequence_partition_start_idx = ( - context_length - SLIDING_WINDOW - ) // CONTEXT_PARTITION_SIZE + + if SLIDING_WINDOW > 0: + sequence_partition_start_idx = ( + context_length - SLIDING_WINDOW + ) // CONTEXT_PARTITION_SIZE + else: + sequence_partition_start_idx = 0 sequence_partition_end_idx = gl.cdiv(context_length, CONTEXT_PARTITION_SIZE) - # num_iterations = sequence_partition_end_idx - sequence_partition_start_idx + if QUERY_QUANT_MODE < 0 and COMPUTE_TYPE.is_fp8(): # Quantize bf16 query to fp8 # Convert query to float32 for computation @@ -1524,11 +1528,11 @@ def paged_attention_decode_sliding_window( 
) # Create mask for valid blocks valid_block_mask = block_indices < num_kv_blocks - # masked_block_indices = gl.where(valid_block_mask, block_indices, 0) + masked_block_indices = gl.where(valid_block_mask, block_indices, 0) block_table_start_ptr = block_tables_ptr + sequence_idx * stride_block_table_seq kv_block_numbers = gl.amd.cdna3.buffer_load( - ptr=block_table_start_ptr + kv_block_start_idx, offsets=block_indices - ).to(gl.uint32) + ptr=block_table_start_ptr + kv_block_start_idx, offsets=masked_block_indices + ).to(gl.int64) # ==================== KEY LOADING AND PROCESSING ==================== # Calculate key cache offsets and load keys @@ -1540,20 +1544,15 @@ def paged_attention_decode_sliding_window( * CONTIGUOUS_KV_ELEMENTS_PER_16B_LOAD + contiguous_kv_element_offsets[None, None, None, :] ) - # Optimize: Start key load, then prepare QK MFMA accumulators/query (overlaps with key load) - key_tensor = gl.amd.cdna3.buffer_load( - ptr=key_cache_ptr, - offsets=key_block_offsets, - mask=valid_block_mask[:, None, None, None], - ) + # Optimize: Start key load, then prepare QK MFMA accumulators/query (overlaps with key load) + key_tensor = gl.load(key_cache_ptr + key_block_offsets) # Prepare QK MFMA while key loads (these don't depend on key data) qk_accumulator = gl.zeros( (QUERY_GROUP_SIZE_POW2, CONTEXT_PARTITION_SIZE), dtype=gl.float32, layout=qk_mfma_layout, ) - # Load key quantization scales if needed (overlaps with key tensor load) if KV_QUANT_MODE >= 0: if KV_QUANT_MODE == 0: @@ -1622,11 +1621,7 @@ def paged_attention_decode_sliding_window( * CONTIGUOUS_KV_ELEMENTS_PER_16B_LOAD + value_dim3_offsets[None, None, None, :] ) - value_tensor = gl.amd.cdna3.buffer_load( - ptr=value_cache_ptr, - offsets=value_block_offsets, - mask=valid_block_mask[:, None, None, None], - ) + value_tensor = gl.load(value_cache_ptr + value_block_offsets) # Compute QK attention scores using MFMA (overlaps with value load) attention_scores = gl.amd.cdna3.mfma( query_converted, 
key_converted, qk_accumulator @@ -1655,11 +1650,7 @@ def paged_attention_decode_sliding_window( ) # Schedule: Start value VMEM load, then QK MFMA - value_tensor = gl.amd.cdna3.buffer_load( - ptr=value_cache_ptr, - offsets=value_block_offsets, - mask=valid_block_mask[:, None, None], - ) + value_tensor = gl.load(value_cache_ptr + value_block_offsets) # Compute QK attention scores using MFMA (overlaps with value load) attention_scores = gl.amd.cdna3.mfma( query_converted, key_converted, qk_accumulator @@ -1790,8 +1781,6 @@ def paged_attention_decode_sliding_window( attention_accumulator += attention_output max_logits = new_max_logits - # ==================== OUTPUT NORMALIZATION AND STORING ==================== - # Normalize attention output by softmax denominator if sinks_ptr is not None: sinks_values = gl.load( sinks_ptr + (kv_head_idx * query_group_size + query_group_offsets), @@ -1800,6 +1789,8 @@ def paged_attention_decode_sliding_window( exp_sums += gl.exp( gl.convert_layout(sinks_values, layout=max_logits.type.layout) - max_logits ) + # ==================== OUTPUT NORMALIZATION AND STORING ==================== + # Normalize attention output by softmax denominator exp_sums_reciprocal = 1.0 / exp_sums exp_sums_reciprocal_cvt = gl.convert_layout( @@ -2549,6 +2540,13 @@ def paged_attention_decode_v2_reduce_kernel( head_size_offsets = tl.arange(0, HEAD_SIZE_POW2) # Initialize global accumulation variables + # if USE_SINKS: + # global_max = tl.load( + # sink_token_ptr + (kv_head_idx * query_group_size + query_group_offsets), + # mask=query_group_offsets < query_group_size, + # other=float("-inf"), + # ).to(tl.float32) + # else: global_max = tl.full((QUERY_GROUP_SIZE_POW2,), float("-inf"), dtype=tl.float32) global_max_prev = global_max global_exp_sum = tl.zeros((QUERY_GROUP_SIZE_POW2,), dtype=tl.float32) @@ -2602,7 +2600,6 @@ def paged_attention_decode_v2_reduce_kernel( mask=query_group_offsets < query_group_size, ) global_exp_sum += gl.exp(sink_token_values - 
global_max) - # ==================== SECOND PASS: COMPUTE RESCALED EXP SUMS AND ACCUMULATE ==================== for iter_idx in range(num_iterations): partition_base = iter_idx * MAX_CONTEXT_PARTITION_NUM @@ -2972,6 +2969,7 @@ def pa_decode_gluon( alibi_slopes: torch.Tensor = None, sinks: torch.Tensor = None, sliding_window: int = 0, + one_shot=None, ) -> None: """ Paged Attention Decode with FP8/BF16/FP16 Support. @@ -3263,7 +3261,8 @@ def pa_decode_gluon( fp8_max_value = torch.finfo(aiter.dtypes.fp8).max # ==================== ATTENTION DECODE KERNEL EXECUTION ==================== - one_shot = sliding_window > 0 + if one_shot is None: + one_shot = sliding_window > 0 _paged_attention_decode_v2_with_dot_kernel_reshape_wrapper( grid, exp_sums, From 3d84d01606cc582caf6492a40ccbb522b50366b2 Mon Sep 17 00:00:00 2001 From: Double Young Date: Thu, 18 Dec 2025 09:30:51 +0800 Subject: [PATCH 04/40] fix(pa_ps): fix pa_ps_asm .co for gfx950 (#1669) Signed-off-by: Double Young --- ...pertokenFp8_gqa16_1tg_4w_qlen16_msk1_ps.co | Bin 36648 -> 36648 bytes ...pertokenFp8_gqa16_1tg_4w_qlen32_msk1_ps.co | Bin 57040 -> 57040 bytes ...pertokenFp8_gqa16_1tg_4w_qlen40_msk1_ps.co | Bin 70984 -> 70984 bytes ...pertokenFp8_gqa16_1tg_4w_qlen48_msk1_ps.co | Bin 74352 -> 74352 bytes ...pertokenFp8_gqa16_1tg_4w_qlen64_msk1_ps.co | Bin 94256 -> 94256 bytes 5 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen16_msk1_ps.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen16_msk1_ps.co index c279c6b49ed3ab711b1dc9353c6878a20af4b972..ab2943cf705eb86300c1998b7e9b6510f790231e 100755 GIT binary patch delta 14 WcmZ26k7>m`rU?d&{u>Ry_W=Mb1_lTK delta 14 WcmZ26k7>m`rU?d&J{t|c_W=Ma_XY(3 diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen32_msk1_ps.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen32_msk1_ps.co index 74db78851ce92df60b5c43013ccaada7b6aedbb2..8e1615c8ac2d65d6eb55582b9f5523fdb01861df 100755 GIT binary patch 
delta 14 Wcmcbxm-)h8<_QLj{u>SF-vazW@LL diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen48_msk1_ps.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen48_msk1_ps.co index a1b245c4684e06b02f02f38f13d9e2f043f50b9f..7789b261df99e8ce931ee31341715324aed30033 100755 GIT binary patch delta 17 Ycmexxgyq8#mI(%o{*8uP4H@l~07vfzdjJ3c delta 17 Ycmexxgyq8#mI(%oK8=Q34H@l~07usbcmMzZ diff --git a/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen64_msk1_ps.co b/hsa/gfx950/pa/pa_bf16_pertokenFp8_gqa16_1tg_4w_qlen64_msk1_ps.co index b243451f07c03ef9714c64934c5de8014c53f155..91d0b0f3b50dd248bb47559f63188d1e0e5e2018 100755 GIT binary patch delta 17 Zcmdn+fOW$I)(HlT{*8uP4H^IE0{}%s2SNY< delta 17 Zcmdn+fOW$I)(HlTK8=Q34H^IE0{}%U2R{G+ From 78994e08ba77530c128de511d7abaf9c390dfac3 Mon Sep 17 00:00:00 2001 From: amd-ruitang3 <145657428+amd-ruitang3@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:13:51 +0800 Subject: [PATCH 05/40] modify test_bf16gemm_test (#1678) --- op_tests/test_gemm_a16w16.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index 1e80071fb1..c5e297b0ff 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -96,7 +96,7 @@ def test_gemm(dtype, m, n, k, bias=False, otype=None, scaleA=None, scaleB=None): ret = {} dim = (m, n, k) x = torch.randn(m, k, dtype=otype, device="cuda").to(dtype) - weight = torch.rand(n, k, dtype=otype, device="cuda").to(dtype) + weight = torch.randn(n, k, dtype=otype, device="cuda").to(dtype) if otype is None: otype = dtype if bias: @@ -471,7 +471,7 @@ def test_skinny_gemm(): "-o", "--otype", type=dtypes.str2Dtype, - default=[None, torch.float16, torch.bfloat16, torch.float32], + default=[torch.float16, torch.bfloat16, torch.float32], help="""Data type of output. 
e.g.: -d bf16""", ) From 290e6595520b6423da3eb10fbd235181306e8c98 Mon Sep 17 00:00:00 2001 From: Satya Nikhil Kodukula Date: Thu, 18 Dec 2025 06:16:04 -0500 Subject: [PATCH 06/40] Fix Ruff command in pre-checks (#1675) --- .github/workflows/pre-checks.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre-checks.yaml b/.github/workflows/pre-checks.yaml index 6bc1cd6b9a..f5dc4eff35 100644 --- a/.github/workflows/pre-checks.yaml +++ b/.github/workflows/pre-checks.yaml @@ -35,7 +35,7 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - name: Set up Python environment - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.12" - name: Install dependencies @@ -46,7 +46,16 @@ jobs: env: REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - ruff check . -e | reviewdog -efm="%f:%l:%c: %m" -diff="git diff FETCH_HEAD" -reporter=github-pr-check -tee + ruff check . \ + --output-format=rdjson \ + --exit-zero \ + --no-fix \ + | reviewdog \ + -f=rdjson \ + -name="ruff" \ + -reporter=github-pr-review \ + -filter-mode=diff_context \ + -fail-on-error=true upload-success-artifact: name: Upload Success Signal From 235cfa6c4698d0c26ab8fe9da366a662f0d6f03c Mon Sep 17 00:00:00 2001 From: JaxChen29 Date: Thu, 18 Dec 2025 22:21:30 +0800 Subject: [PATCH 07/40] fix mha bwd golden perf issue (#1666) --- op_tests/cpp/mha/benchmark_mha_bwd.cpp | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/op_tests/cpp/mha/benchmark_mha_bwd.cpp b/op_tests/cpp/mha/benchmark_mha_bwd.cpp index 82cf9769a7..b8829a1372 100644 --- a/op_tests/cpp/mha/benchmark_mha_bwd.cpp +++ b/op_tests/cpp/mha/benchmark_mha_bwd.cpp @@ -953,17 +953,24 @@ bool run(const ck_tile::ArgParser& arg_parser) } // dS_i_j = P_i_j .* (dP_i_j - dO_i dot O_i) - ds_hp_host_ref.ForEach([&](auto& self, auto idx_gmn) { - AccDataType do_dot_o = 0; + // Precompute dO_i dot O_i for each (head, seq_q) to avoid 
redundant computation + // This reduces complexity from O(nhead * seqlen_q * seqlen_k * hdim_v) to + // O(nhead * seqlen_q * hdim_v) + O(nhead * seqlen_q * seqlen_k) + ck_tile::HostTensor do_dot_o_ref({nhead, real_seqlen_q}); + do_dot_o_ref.ForEach([&](auto& self, auto idx_gm) { + AccDataType sum = 0; for(int o = 0; o < hdim_v; o++) { - auto idx_gmo = idx_gmn; - idx_gmo[2] = o; - do_dot_o += ck_tile::type_convert(do_host_ref(idx_gmo)) * - ck_tile::type_convert(o_host_refs[wb](idx_gmo)); + sum += ck_tile::type_convert(do_host_ref(idx_gm[0], idx_gm[1], o)) * + ck_tile::type_convert(o_host_refs[wb](idx_gm[0], idx_gm[1], o)); } + self(idx_gm) = sum; + }); + + ds_hp_host_ref.ForEach([&](auto& self, auto idx_gmn) { self(idx_gmn) = ck_tile::type_convert( - p_hp_host_refs[wb](idx_gmn) * (dp_hp_host_ref(idx_gmn) - do_dot_o)); + p_hp_host_refs[wb](idx_gmn) * + (dp_hp_host_ref(idx_gmn) - do_dot_o_ref(idx_gmn[0], idx_gmn[1]))); }); if(use_dbias) From b1278bd206d041ba7d8d30af099844b7854a677c Mon Sep 17 00:00:00 2001 From: steamedMantou <82486092+steamedMantou@users.noreply.github.com> Date: Fri, 19 Dec 2025 10:19:39 +0800 Subject: [PATCH 08/40] topk uplift v1 (#1662) /lgtm The customer has tested the code. It can work. 
* topk uplift v1 * topk add api for choose topk_v1 or topk_v2 --------- Co-authored-by: yonshuai Co-authored-by: yongshuai --- aiter/jit/optCompilerConfig.json | 1 + aiter/ops/topk.py | 12 ++++ csrc/include/rocm_ops.hpp | 9 +++ csrc/include/topk_per_row.h | 9 +++ csrc/py_itfs_cu/asm_topk_per_row_decode.cu | 63 ++++++++++++++++++ .../asm_top_k_per_row_decode.co | Bin 0 -> 29736 bytes op_tests/test_topk_per_row.py | 39 ++++++++--- 7 files changed, 123 insertions(+), 10 deletions(-) create mode 100644 csrc/py_itfs_cu/asm_topk_per_row_decode.cu create mode 100644 hsa/gfx942/topk_per_row_decode/asm_top_k_per_row_decode.co diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index 090a742caa..ad1a9b3309 100755 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -1036,6 +1036,7 @@ "module_top_k_per_row": { "srcs": [ "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'", + "f'{AITER_CSRC_DIR}/py_itfs_cu/asm_topk_per_row_decode.cu'", "f'{AITER_CSRC_DIR}/pybind/topk_per_row_pybind.cu'" ], "flags_extra_cc": [], diff --git a/aiter/ops/topk.py b/aiter/ops/topk.py index 1c3666f832..5101a266c2 100755 --- a/aiter/ops/topk.py +++ b/aiter/ops/topk.py @@ -219,3 +219,15 @@ def top_k_per_row_decode( stride0: int, stride1: int, ) -> None: ... + + +@compile_ops("module_top_k_per_row") +def top_k_per_row_decode_fast( + logits: torch.Tensor, + next_n: int, + seqLens: torch.Tensor, + indices: torch.Tensor, + numRows: int, + stride0: int, + stride1: int, +) -> None: ... 
diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index b19c85495e..925deb96ea 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -1473,6 +1473,15 @@ namespace py = pybind11; py::arg("indices"), \ py::arg("numRows"), \ py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode_fast", \ + &top_k_per_row_decode_fast, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ py::arg("stride1")); #define MLA_METADATA_PYBIND \ diff --git a/csrc/include/topk_per_row.h b/csrc/include/topk_per_row.h index e3bae1887d..86fcf9bf0c 100644 --- a/csrc/include/topk_per_row.h +++ b/csrc/include/topk_per_row.h @@ -18,3 +18,12 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t numRows, int64_t stride0, int64_t stride1); + + +void top_k_per_row_decode_fast(const torch::Tensor& logits, + int64_t next_n, + const torch::Tensor& seqLens, + torch::Tensor& indices, + int64_t numRows, + int64_t stride0, + int64_t stride1); diff --git a/csrc/py_itfs_cu/asm_topk_per_row_decode.cu b/csrc/py_itfs_cu/asm_topk_per_row_decode.cu new file mode 100644 index 0000000000..f0e9a3e2a4 --- /dev/null +++ b/csrc/py_itfs_cu/asm_topk_per_row_decode.cu @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+#include "aiter_hip_common.h" +#include "py_itfs_common.h" +#include +#include +#include + +struct __attribute__((packed)) TopKDecodeKernelArgs +{ + void* ptr_logits; + void* ptr_seqLens; + void* ptr_outIndices; + int32_t stride0; + int32_t stride1; + int32_t next_n; +}; + +void top_k_per_row_decode_fast(const torch::Tensor& logits, + int64_t next_n, + const torch::Tensor& seqLens, + torch::Tensor& indices, + int64_t numRows, + int64_t stride0, + int64_t stride1) +{ + TopKDecodeKernelArgs args; + size_t arg_size = sizeof(args); + + args.ptr_logits = logits.data_ptr(); + args.ptr_seqLens = seqLens.data_ptr(); + args.ptr_outIndices = indices.data_ptr(); + args.stride0 = static_cast(stride0); + args.stride1 = static_cast(stride1); + args.next_n = static_cast(next_n); + + // Load the compiled assembly kernel + // The mangled name: _ZN5aiter10DecodeTopKL19topk_per_row_decodeILi1024ELb0ELi4EEEvPKfPKiPiiii + // corresponds to: aiter::DecodeTopK::topk_per_row_decode<1024, false, 4> + static AiterAsmKernel impl_topk_decode( + "_ZN5aiter10DecodeTopKL19topk_per_row_decodeILi1024ELb0ELi4EEEvPKfPKiPiiii", + "/topk_per_row_decode/asm_top_k_per_row_decode.co"); + + const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(logits)); + const hipStream_t stream = at::hip::getCurrentHIPStream(); + + // Launch kernel configuration + constexpr int kNumThreadsPerBlock = 1024; + uint64_t gdx = numRows; + + TORCH_CHECK(gdx >> 31 == 0, "numRows too large: ", numRows); + + impl_topk_decode.launch_kernel({&args, + &arg_size, + static_cast(gdx), // gdx: one block per row + 1, // gdy + 1, // gdz + kNumThreadsPerBlock, // bdx: 1024 threads + 1, // bdy + 1, // bdz + stream}); +} + diff --git a/hsa/gfx942/topk_per_row_decode/asm_top_k_per_row_decode.co b/hsa/gfx942/topk_per_row_decode/asm_top_k_per_row_decode.co new file mode 100644 index 0000000000000000000000000000000000000000..ea9de63a66989b0d9f9adaa6107ce898a6097d7a GIT binary patch literal 29736 
zcmeHweQ+Gdb!Sh{?Ck99?qIN30E-0)E`E5#pBr1s{+NPC6iIS^>K>*}} z0s#a7W%(`~&JOWK#DWkUi>D~E=s1b<;e5zfcPYE>ZkI?@SGG&3Blyq3Rld8_Rb5r0 zs=NP^@~M2>`@No--Pt7wfh3%A&V-8U>95!Cqx<#i_j+b8KX>Y9xFq#)=Kjg-Z=O^5UQz0rl$|CrG1B){i#R~rQxz%^2Kk$J0sphq#V(NFVzrL_r+PTs3GXtZ4kQ+aHc50ZKO^y7EVeRX=*9Jx}4)?!0GB)(5Zj=5q7tfv@ zp8Oa9+QR}cKNi3hAK8@KR0$&chiAUnFHok{NhCa)bP0r!(-F^ zXGdNi9_lAX{dM8=UkH|ixs~dH(UEgw+P}N2oH}&HaT_Prm9QKtn1J(%8U-;qN`)|0|z4G%zwf zJh{L3OFBWW04PrAkIEm5vB zUqAVPDfR!Q@vJj;la@S{eSY8X{rbCWyOd|Y{XfFKKJe?j<&Rc%f7`}qwr!9gG1lLo zADQSMyf`w{fB4X${R2maj`p7UDWy*>4t2yQ$PK&6K#8 zkMX3$8Ec< z&55Sc*Q4z_@K2Q6Z9C#l+NScjOwibGBo8_%)sZHjy(SmHpi3i|B z+-}Es;Zhpq?QQZb!Lw6uIVsVVwt6~Wf*7ITV_f&``u!S{q-$7#c^!?uA63@Vwf$IFmbIi zn=3LK@Y%sN6&By=mv*e<%+D--XI{s=*Ati%2@YAv1sy4Oes4+XO535w+8rmA?y`Bp zx|!zr&rfu$OOXgb@tX?pB^+|)q?8mU2b9T-`WdWR=H`lWaf&OK}V;v zxRlm)yFD6psGM+$$uUc1PRUB#na{>z`Mt5ZOXs;W54x`Y8v5Lb8J(PIx7EmAEXjZN zeabd0Z}`S z>W*WodGJMy6B6`jJ!MU+p+^z)m@i9>b@Zr#aeR6(XKTkvh;f$63#EdZ08OU#8)c_t zV-4;?N^{gw@64Am|G;;gcs{t*iMgR-e(!=jn0@;Dah48|TwVMEe5m8MCiuNrwDiS* zTvt4~s)FA#$rZ9IU|-O8FBUP6ElVf4s*o%AtwXMI0{S0v0B(C7zQj9ufu}Ng9rLz3 z!`Y2YXQvMQb^||}e`aU|lk7Cjzx%-RUh^-2XU+WcJBOWOgTU#9wUERz zo6Sw)m398k++A}XG}~OaRL}!IT$t88t^J=XjIJZE&ZZP_;5)ZUkWm7%x`}iqwWF($ zGBc>Y>AJ^8*KzJtWOiqbx9QOKchI`?5QmR^^kVEjDEHXQ1g#uNWssleNRwhHgim_m4qHpgC58rYX^uR`ZG zus3e6c5wD)$BFDm{3$EkD3$FKZwtL+KlK#b)nMdF2Z_3bR4$xYsnmskI#8FC%2)1H zmy*i!mAVrw{G{LSgcW{9D(_<7S!;-d_^u~UWNYf4mC83dyzx%B{Z>0pkVO7?qTp0VpeT3%^2pf|kyW5_>_wjnRS z7x_;Zc{S|ipFn;e^k&4%???WVMqZD4`9sJbG4ipvmp_L5r;NPO;O0r6G(4VMVTN5; zO=n9%*Uiy&s6ai;zP-lw z?&a89-(%QQ8Rjp|l~6p9C%NizJx}vff}Zf_C)14gjKl)=h^1l+#t30wVwjsWo`Uf( z*Hyx?98SJReUdTf5{*fFN(61xrr3GPpDLu+HEgD3Jf9@L1o)|`5mrE+`V-Edczo$f zj}gN%0-nu#xsf$iXe}5eqhOSib2rQ7%gLKz*hx#XADKqCCPm6(UKvJrm_GQ4LcBH3VI1F%NfRkLuMj_z%Fl^w`XOFIfs};FX z-;dZ_i9z;|QKekypFlo=u>*QW_4JJ9=^5SAGYO#^;;PUyGW3kxm=$_KJVWPrdM3G< zo)K-KlL(f0CY=MlC7wM!Bc44yBc3ggHOk)Fv_*S-8? 
zppNv+z3NEMRO)g}-sktrSu*rYho@&c_T{oQbwNFobNj7!!k&p(IB^Q|2vKU$}Eowo;@on9vstb`;Iu^BG&LfpUs;N0Wu0+dUhE?`*u z47$KFKD;W?GxWe-HeVv$fWu@=Dk|XHlC(=#*9N>Pk_Ju>~@E zS6V2g*#BLNg|6bP&|!5t_7d1{ha|~iUCfpqlbj^%JjRYGwsfsn;0LU-biK&>&fd2}I=>{@ zWsPhht@*IEdiZEh`RGyC4nORkD|Q??%w`WCe!1z$JK*(O#h=IAYC1wTxm~oFy+G$d z=kQ_Jxqa*HZLFPW75TpOdV2~S*z?umb7*rVcTMd1awA0)zkAEcac~LeBdkH*$bs&6 zZxy^UuzgKxOfZAaon9F_t+B@AEWF%uF$adLzD6G#q2e;D@cgLf?8rJ8LI?5)JJk%_96- zA*0q%Bb&(`&bL1XKBBKdH0R8&vYEpg1E0;qc?~J)kamt$gR~W6k`B>m^PxRg2|;ey zb2pGzq}jAq&BHzA!-rkon=bF$8%;KAR(bym7`M=C=C{m$%(c7t-?Rb$n>OHo(+2!+ z0{2f%{wH$z)*$~A z#Q%iH|6GOttseg)_rZT%@ITR0P9$ph9})adyxi2}&VAy)L;P>5oBxS~nE#PB77_EG z@|gdTjriZZ0sosf;D7T5{BQnI@jvPEUuU|{fBi20C-eF}`Jb%tUw;7pN4LoTZ{L9) zc>w-@i~9Ncr*#AVw{F1y)(!aIx^@1)jWJ38Y;FGEAO9V|c53GT*7-km$MtQgI9H5f zK6}0nZx2XVtBGV!Ihl0VLBiETI&{Ho!e0j~#a~3f;2I!*7U5HX|BUtlEy6Uh4op+5 z13G`OJ=cmqgLV};+{Zqk-n9>;S-8#9Lv5ZOYV-6^Te?OMJ&rLgX->$2{FJ1J!T~*` zqF#;96;(%FvU$={d5c&Jdh{Q8EznsEHWi)6bjW55c!g6-fnGRtPoyY(%9l{^#?f28)|{+HjKfh(%+Y7OQbjGOiJ_ha%H}{HkW%Y zO<=CNdLx311HPRa2{B=}Xz*1qQ|C73zAY^l^@57?`-ihkPB7h}XL}az2JEez=~vW} z5=}CykAzW=ay^dnf^O>#Nfx!Ejt>3FG9Bxh9%1Lzx%nB>%)^Iz_3vm-dNvdJ?8rtY zza3_aPC7sbKJ13HEWIne333X|>00hey(Mh482HlHcjtQs8dFAy#?9$@SGb^oV4Q<8Z0FXDVwjN9jqtB@@Ymj(O`g(AU?MVw1) zLJ?;jTJJ0~u^z{mWt?-U9kmgDZ@@06ba4&}p%2-SWG9-vqP_6Jg=E!I$ahV65WVcH z#lQUJ7R$!ji|j?Rg`(0QN$_9y*lJx_SnW|DSMuE{kn5_3^OUA%3R*0CQ!}!m2EE(S z^)8$`)*%*H4@hMeDeE0QHt?l<1HxD$#Zgk!GqUi}P9k#*^>l@YQn&}nA9DFWkuTMT zuM#|l&k^T4MrZ%fZbk6CVGr$xi={U;Jxg}EtX>}~w}?PBMIW~C=$RzdsSBE@Hn#*Ta%da-k%vp{$| z2Ndutz0Psga7t4heNy52`$-R~= zcf&v5V`-6h)(u6iz}wU7`az=m8S7+9Mj{{Da~UqdZj~ahC~cI6Bv}*X3S_6<)})YZ zn{K~XIqx~oSkGmyW-VrB-&J^51a1MFefyI1OU$|fu&*+=jV!BKO)|4z-a?yqw$SFc zx6tP8EwuTqEwp)S(>5G*u`I{hptFA(YeT|3wGUpqlxF|y7Gi@eeg2Q7Wi={xg#JikmO+gb51q8$E3Dz}d(m<&H!r9FI$Zh5-49DYT&yr;GtK1CP)Ky5kviEjJjqC9~) z=lbH(>y4yw12iO2w^PtPN_uyvOB>;T!rBQRX$SD^0RA1HI_}|#a^O$p_K5`B`DCCR z_`Bul+H&CUmiLHq;$;I64B}yfU%Z0n=z0RU`1Qm)v|cx{Ul4vEVNL$juV9xY-{;e; 
zLwYK|ONNX>96n6wHdOdBqPPaFD@q|hsqX0K%2*m5J$0A48U>01(PGn)7CEUnqR z5u~21l)-2aWmKo3f59-a!nbAsmNhr0?^!H1RbW;7Rbf@(Zw5=X=B_H%%$%ys37eN_ zB7YEzrhyy$ckwj#dNb>&i5{hibvumag|&`K7#v7L4EKUvdJSb}L0Mf~AiG0`FG|P# z3`<>E@M%^9G`l#p0Z0{hvSV%DMh4Sb1pcUkX=drnSO70oYk4k#bBj-NuwNBcmFCT0 znbz`E&^>3W%i^r&(;RAe#-sTenq5`7f<~8?#RsGX@~fjID8Kuor508VEjY6TX@P(s zzh~G&Dv_O0TY8jM-i=FCLVl7;x5KSyFUs(xm}1UetI(`N zek$f7%~_Yfdtp(ti|bR9H35C z%3zZ~mpsh~U!%2$663V$df&VS*N{uZN@sYhl}HsfD!}Ewo>R z8(q0@jnox%J#>XDm+F2+b3nsBM(e6NFHTmU=huRwKSJB zs_RPVkk%^AB)5=9V+b@p94*ih+eS+*tgX@_d9*-RJR~hx*V{%*Ev&85f{7{C6?DZz z(t>rpZM4+F+A1v-9Z{jBsw3`iy+B9Q(Gt`V_qSeZVb#zQ&=KLldijd50kv*NO9=gd zxTZpDC=zB7@&l61$8o2@N>uehg7iVe)q!;8wE6S6y9s^ZmAQKHd2(W|MZ?Eg0r#27 zk62KlTk7;duwNBc75+xBgl++>By2v?2Z?|_AQ)ljJ`k~{MO=NZDzxod>#EFDE_7Sf zekEBcp8e{c8C>1wm$~-NF`-iqiH~*mtKY8*3$%FfH-aV3n1E$ii{f0Ut4s5iXTQ=JSeD^K zLaIcMyXOkoT{kccuwLklyHv4D>5MCTYlF_wbe^U2onPkS)JDu$Q=`yFy6&EF$xn|n zjj?B`VwVQ{Rbf@(Zv;#D2*}r8IoB_WI2PFqKz>bF57A4&o70Zu-JJ#!UWg8izns_-|0MR5#* zrC82Yc+%(KPZ9V1g)fO{L5$0%i-q7VLtHBk>fEx$67IrfZBQxRdjaM*Qw&6ZaEl?< z?2sP=cYreF_sa+$l|(6;i`apn9lmR_{Ydr|+PyNL1~D(f&=t^of_CUC-2I=^2`>Cv zfQKdmzPRrxaKyV8!v6+*89b%IxZ77!yoUoviEzYOon#WgkrR%PpYWBD4~f^v$Mqrk zWiC&t3}-Oh!zTHN`2n6{%nFWxhb98YkdId!H}L&u^5N5a7Cv8>Qwri=AKc=EBm7|G z+X~<)6ONcqBF=Vy>j5|h)&s_@;0SmiN8qUXc*UV0{bvq^Pcd5`$Ay&IL+^vggrll< zQQSR%qe3`BFd=GBehptgRCNaQ`-VCLI0p0&#;o87c<3&04EuP+)ldaT8J;qVDf>9i zq_iF#I4XprrgiDqdjmLz2uBD;rS`OUh3?YPCo5&O`W`q2^gYI`;0Sn7fdWUZf@2tQ zF!BduJ}Mdly`KFb{J<%_hvI=D!VxjeWG#*=;RwNmeH?xNkdF`TEq>W%dtwkrz(aR| zW5mZRu7xW&h9YVWj)hdL#{iBh;fS-3k+~k^W0-J+V8mS(vS)n1fs2o8&-i7&4MKI% zCh!ry65%(14-0!jg^z%TCIZK31xFZE{<9hlGv6OTd}NaO9=-EoBCc9a89lCE`ruZW za5juC6F6HfZ!ge$L4+@S<{G6Cmbi=70=#R_kFT&6@YXML&sVQ_{*D*KS(IY60tn8jnUQc!v#*iTkI4_y|KxA}}unUeId~jTiV+;V0m2iWeii0yt5e zq#h^ev2Dc(@HWLss>4ay04M0JZN&-jHpPk8;iPVW6ZF)!;skh`;w0DMq;G%|^wPHC z1bCa`q}1U=F~Iuy1UPqOKhKdd{q7@yaHAHH3 zr-?m>;szV(Mc+pk)XiJcae+OK!M7ZMFZK)QBe7rL3sz#kK%B6`GtzyV>3&AK-#b5$ 
z?&r1eVW$ckPVAWkpW#dzgb%&5>HbIWmj&S~c%L+Yzkm-tArb!teAt^o_^8{!27uk{ z*#Pi&i1)wU{);7gpSB)8*8Dx;qYgs|`Bv>995j6R^vNQ6Kh}p&@59!^$NIe|eAM~y z#d|curVhY|=~@FnxdD8v#e2d>-KOwy9YngXw<&y>vNiDO9X5J5S8qdn zU!!*q7sPvQ{+YQVVM{T4+Cf6ekbH$*c0azqlX# zNOf_);9R`FxL+--nz)}g7fBxm<9;*;?~ao{C@rc>7t4jg~%W8}c` z$38|59Dign^5tNh<{@I_!0};XlC#Fi*VUK?9N6M^HsW8|KVLOA+yaf&Tb zT%5k)XeF;LG*f&DG4hS#Kf><99qk0@@%`c0=QoXMi5NG=_1_&4@%h>q)Aq!G0x>4K zS6Lt9fSdl|Vn6}-k77W$`?X07Xrma-cE*6re}otiedDk`2IRg+E!NoGF(AB;(&aGm zy=bw%m|Y;Pfy-~p?t7B%H{-hSUCbGcbF`Yl+_v)G zxpaya|8VY`_&#DcVn_5&-W!A3j!9yNU*GbS5W?AH{lteO<>Vr3=irbb%44_Y?5ynHs# z@J$+gs>qULd^a86v=#4+;!6te9u3_ z#P^Lba5C_{QouN`%*<1n8NbXZ)e~;2HGvPxovZ)F`(_4Mc01I(Us0m?o|HASD85UK z`*jJ>OEhWtc6wwdzIiUj!8fw$8{MFn#vr)34amw2u{{JfyP5l)YHG`+Mv4Po{RbKc z@2Pf!w;({o|A^YA>dA#qMzk`EcIg0i|4o;E!91t?# znF0^UfN?Q4;kAlyWQM?hM*LSi{;MwkZ>Z@`Sx#9ul#Wh(6Akild062&_;w@N(s{#7 zbQ13k@HvJwhSWB$UGjPF%R=y5b$J>?O5f+EGR-T~krMs?mFP#jnBf}?p=QbBv$IG% zLHQhgSGCOWuJ8}bt!yeup4(Kxt8iMi#zW2`9n zP6)Fx2Kpet4%mY4`1T0-sm0#{6bAiaa}j@dHCp_6#q6lpQ1{CJt(X4&o$@$6{;k&# z;u3$R57^6Bl2apN_?fKZ8T@!w=G3W^nf=Fdy}8~@_rB5bv2*y@q=|v)JpIB_|I~Q@ zbRNIJG&yl$s()}`tbgj_#KidIG+GS4I&f}e>>TQc@e@mDE}rWj89O`Pe|CJbe*iy? 
zG(OcoF*$yAWOR6H-_+#bzR}UwE&d2 zXATb?Jv`WZ=)j>fgGYvX_n$p-;K*M3jjzlg{c2Rk`^_Q`;d3L?neP1b^u*NhefyBf zUp$i=9KW#drGcU0@iFo1QsWmUE=~{EjGmbsAAI6)?qF_z=D?x-M-ScyA{7R1{2=53 zCzbX_Z9VqkZTTG~mKz(N9%i|rFOK0Cr&#XX*u`9aU@FhN|5N$&B+E^X4-HHYu-x?U z>(iq3z=e@P6c3M1jze-oLy#NGotc_q#4h?pFGfFMH9c?!X&xZlCpX=`pGew1NFVoO zOZ_Kb`OMRwe*W27F+6aN(!Xquzs``oB--Rg=1``J=wHrv3i@I``Ahd#!2t z(pa>G(x1Qn@g5udhjMw`@9(d3KkXK+tML8)UNNI@GZ1XQzpoekw7$QOzn|_xU#R*@ ze_s#zsjr97MEnsA!N13mp)!Ab!AxpEjKN=@_XbtwKH;<@IZ?F*0e*a^20Q{_<{wmtmp(r4IbIb+!x|`6YInmBzYi0fo oqTHw5YbugnvFk8)F<6Lae_X%#0vB>(^b literal 0 HcmV?d00001 diff --git a/op_tests/test_topk_per_row.py b/op_tests/test_topk_per_row.py index 7f389638f6..ffabdebae6 100755 --- a/op_tests/test_topk_per_row.py +++ b/op_tests/test_topk_per_row.py @@ -157,19 +157,31 @@ def run_top_k_per_row_decode( numRows: int, stride0: int, stride1: int, + fast: bool, ) -> None: """ Run the top_k_per_row kernel. """ - return aiter.top_k_per_row_decode( - logits, - next_n, - seqLens, - indices, - numRows, - stride0, - stride1, - ) + if fast: + return aiter.top_k_per_row_decode_fast( + logits, + next_n, + seqLens, + indices, + numRows, + stride0, + stride1, + ) + else: + return aiter.top_k_per_row_decode( + logits, + next_n, + seqLens, + indices, + numRows, + stride0, + stride1, + ) @benchmark() @@ -231,6 +243,7 @@ def test_top_k_per_row_decode( top_k: int, next_n: int, data_generation: str = "random", + fast: bool = False, ) -> dict: """ Test top_k_per_row_decode with seq_lens tensor. 
@@ -262,6 +275,7 @@ def test_top_k_per_row_decode( num_rows, logits.stride(0), logits.stride(1), + fast, ) torch.cuda.synchronize() @@ -282,6 +296,7 @@ def test_top_k_per_row_decode( ret["context_len"] = logits.shape[1] ret["all_close"] = all_close ret["us"] = us + ret["fast"] = fast return ret @@ -370,7 +385,11 @@ def test_top_k_per_row_decode( for ctx in args.context_len: for k in args.top_k: for n in args.next_n: - ret = test_top_k_per_row_decode(m, ctx, k, n, data_generation) + ret = test_top_k_per_row_decode( + m, ctx, k, n, data_generation, False + ) + df.append(ret) + ret = test_top_k_per_row_decode(m, ctx, k, n, data_generation, True) df.append(ret) df = pd.DataFrame(df) From d3a4a0f1f5c83ee104a612e02447d03b0e38a45b Mon Sep 17 00:00:00 2001 From: Yu Guo <82124926+yuguo68@users.noreply.github.com> Date: Thu, 18 Dec 2025 21:21:57 -0800 Subject: [PATCH 09/40] fix missing return in mha_bwd (#1688) --- csrc/cpp_itfs/mha_bwd.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/csrc/cpp_itfs/mha_bwd.cpp b/csrc/cpp_itfs/mha_bwd.cpp index e2f97f8541..ffd3ccc2a7 100644 --- a/csrc/cpp_itfs/mha_bwd.cpp +++ b/csrc/cpp_itfs/mha_bwd.cpp @@ -127,8 +127,9 @@ std::tuple get_heuristic_kernel(std::stri float mha_bwd(mha_bwd_args a, const ck_tile::stream_config& s) { + float asm_ret = fmha_v3_bwd(a, s); #if ONLY_FAV3 - return fmha_v3_bwd(a, s); + return asm_ret; #else fmha_bwd_traits traits{a.hdim_q, a.hdim_v, @@ -225,11 +226,11 @@ float mha_bwd(mha_bwd_args a, const ck_tile::stream_config& s) /* drop_seed_offset */ a.drop_seed_offset, }; - float asm_ret = fmha_v3_bwd(a, s); if(asm_ret == -1) { return fmha_bwd(traits, ck_args, s); } + return asm_ret; #endif } From 97e760ce6c8048c5817f999b991a7c4378ecc0f7 Mon Sep 17 00:00:00 2001 From: la <46212055+junhaha666@users.noreply.github.com> Date: Fri, 19 Dec 2025 14:04:11 +0800 Subject: [PATCH 10/40] Remove the input parameter "out" in gemm_a4w4 (#1679) * Remove the input parameter "out" in gemm_a4w4 * update * 
format --------- Co-authored-by: valarLip --- aiter/ops/gemm_op_a4w4.py | 28 ++++++++++++++++------------ csrc/py_itfs_cu/asm_gemm_a4w4.cu | 3 ++- op_tests/test_gemm_a4w4.py | 24 ++++++++++++------------ 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/aiter/ops/gemm_op_a4w4.py b/aiter/ops/gemm_op_a4w4.py index bd3759f98c..752c724645 100644 --- a/aiter/ops/gemm_op_a4w4.py +++ b/aiter/ops/gemm_op_a4w4.py @@ -4,20 +4,17 @@ import functools from typing import Optional -from aiter.jit.utils.torch_guard import torch_compile_guard import pandas as pd import torch from torch import Tensor from aiter import logger +from aiter.jit.utils.torch_guard import torch_compile_guard -from ..jit.core import ( - AITER_CONFIGS, - AITER_LOG_TUNED_CONFIG, - compile_ops, -) +from ..jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG, compile_ops from ..jit.utils.chip_info import get_cu_num, get_gfx from ..ops.gemm_op_common import get_padded_m +from ..utility import dtypes @functools.lru_cache(maxsize=1024) @@ -66,12 +63,15 @@ def gemm_a4w4_fake( B: Tensor, # B:[N, K/2] f4x2 A_scale: Tensor, # A_scale:[M, K/32] e8m0 paded B_scale: Tensor, # B_scale:[N, K/32] e8m0 paded - out: Tensor, # Out:[M, N] bf16 bias: Optional[Tensor] = None, # bias:[1, N] f32 + dtype: torch.dtype = dtypes.bf16, alpha: Optional[float] = 1.0, beta: Optional[float] = 0.0, bpreshuffle: Optional[bool] = True, ) -> torch.Tensor: + m = A.numel() // A.shape[-1] + n = B.shape[0] + out = torch.empty((m, n), dtype=dtype, device=A.device) return out @@ -81,8 +81,8 @@ def gemm_a4w4( B: Tensor, # B:[N, K/2] f4x2 A_scale: Tensor, # A_scale:[M, K/32] e8m0 paded B_scale: Tensor, # B_scale:[N, K/32] e8m0 paded - out: Tensor, # Out:[M, N] bf16 bias: Optional[Tensor] = None, # bias:[1, N] f32 + dtype: torch.dtype = dtypes.bf16, alpha: Optional[float] = 1.0, beta: Optional[float] = 0.0, bpreshuffle: Optional[bool] = True, @@ -93,9 +93,10 @@ def gemm_a4w4( It is used to perform matrix multiplication with 4-bit 
quantization. """ # Load the A4W4 GEMM kernel - m = A.shape[0] + m = A.numel() // A.shape[-1] n = B.shape[0] k = A.shape[-1] * 2 + out = torch.empty(((m + 31) // 32 * 32, n), dtype=dtype, device=A.device) gfx_arch = get_gfx() if gfx_arch in ["gfx942"]: raise RuntimeError( @@ -114,12 +115,14 @@ def gemm_a4w4( # or bias is None ): splitK = 0 if splitK is None else splitK - return gemm_a4w4_blockscale(A, B, A_scale, B_scale, out, splitK=splitK) + return gemm_a4w4_blockscale( + A.view(m, k // 2), B, A_scale, B_scale, out, splitK=splitK + )[:m] assert ( out.shape[0] % 32 == 0 ), "Dim0 of gemm_a4w4_asm output needs to be padded to multiples of 32!" - return gemm_a4w4_asm( - A, + gemm_a4w4_asm( + A.view(m, k // 2), B, A_scale, B_scale, @@ -131,6 +134,7 @@ def gemm_a4w4( bpreshuffle, log2_k_split=splitK, ) + return out[:m].view(*A.shape[:-1], n) def gen_gemm_a4w4_asm_fake_tensors( diff --git a/csrc/py_itfs_cu/asm_gemm_a4w4.cu b/csrc/py_itfs_cu/asm_gemm_a4w4.cu index e68a18de27..c76d9107d7 100644 --- a/csrc/py_itfs_cu/asm_gemm_a4w4.cu +++ b/csrc/py_itfs_cu/asm_gemm_a4w4.cu @@ -113,7 +113,8 @@ std::tuple get_heuristic_kernel(int M, if(cfg.bpreshuffle == bpreshuffle_en && (cfg.splitK >= log2_k_split_en)) { - if((N % cfg.tile_N) == 0) + // tile128x512 may not support N % cfg.tile_N != 0 + if(cfg.tile_M != 128 || cfg.tile_N != 512 || (N % cfg.tile_N) == 0) { std::vector splitK_list = (log2_k_split.has_value() && cfg.splitK) diff --git a/op_tests/test_gemm_a4w4.py b/op_tests/test_gemm_a4w4.py index 8a1fed83f5..99f62455ff 100644 --- a/op_tests/test_gemm_a4w4.py +++ b/op_tests/test_gemm_a4w4.py @@ -1,14 +1,16 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
+import argparse + +import pandas as pd import torch + import aiter -from aiter.test_common import checkAllclose, benchmark, perftest, run_perftest from aiter import dtypes -from aiter.utility import fp4_utils from aiter.ops.shuffle import shuffle_weight -import argparse -import pandas as pd +from aiter.test_common import benchmark, checkAllclose, perftest, run_perftest +from aiter.utility import fp4_utils torch.set_default_device("cuda") torch.set_printoptions(sci_mode=False) @@ -98,13 +100,10 @@ def test_gemm(dtype, M, N, K): x, x_scales_shuffle = quant_func(x, shuffle=True) w, w_scales_shuffle = quant_func(w, shuffle=True) wshuffle = shuffle_weight(w, layout=(16, 16)) - out1 = torch.empty(M, N, dtype=dtype) - out2 = torch.empty((M + 31) // 32 * 32, N, dtype=dtype) - out3 = torch.empty((M + 31) // 32 * 32, N, dtype=dtype) - bias_f32 = None x_scales = x_scales.view(torch.uint8) w_scales = w_scales.view(torch.uint8) a, avg_a = run_torch(x, w, x_scales, w_scales, dtype) + # out1 = torch.empty(M, N, dtype=dtype) # b, avg_b = run_triton(x, w.T, x_scales, w_scales, out1, dtype) # b, avg_b = a, 0 # err_b = checkAllclose(a, b, msg="triton ") @@ -115,23 +114,23 @@ def test_gemm(dtype, M, N, K): wshuffle, x_scales_shuffle, w_scales_shuffle, - out2, bpreshuffle=True, ) - err = checkAllclose(a, c[:M], msg="unified api") + err = checkAllclose(a, c, msg="unified api") ret["us"] = us ret["TFLOPS"] = M * N * K * 2 / us / 1e6 ret["TB/s"] = (x.nbytes + w.nbytes) / us / 1e6 ret["err"] = err - # kernelName = "" # "_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E" + # kernelName = "" # "_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x512E" # log2_k_split = 1 + # out2 = torch.empty((M + 31) // 32 * 32, N, dtype=dtype) # d, us = run_gemm_asm( # x, # wshuffle, # x_scales_shuffle, # w_scales_shuffle, - # out3, + # out2, # kernelName, # bias_f32, # bpreshuffle=True, @@ -144,6 +143,7 @@ def test_gemm(dtype, M, N, K): # ret[f"TB/s {tag}"] = (x.nbytes + w.nbytes) / us / 1e6 # 
ret[f"err {tag}"] = err + # out3 = torch.empty((M + 31) // 32 * 32, N, dtype=dtype) # e, us = run_gemm_ck(x, wshuffle, x_scales_shuffle, w_scales_shuffle, out3) # err = checkAllclose(a, e[:M], msg="ck ") # tag = "ck" From 6f20772154f0c12a151cf883b1c6f5e4f50cd3db Mon Sep 17 00:00:00 2001 From: shay-li77 Date: Fri, 19 Dec 2025 14:54:59 +0800 Subject: [PATCH 11/40] fwd v3 hd192 optimize inst alignment for causal mode (#1663) Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> --- .../MI300/fwd_hd192x128_bf16_causal_rtna.co | Bin 30872 -> 30880 bytes .../fwd_hd192x128_bf16_causal_rtna_group.co | Bin 31008 -> 31016 bytes .../MI300/fwd_hd192x128_bf16_causal_rtne.co | Bin 32472 -> 32480 bytes .../fwd_hd192x128_bf16_causal_rtne_group.co | Bin 32608 -> 32616 bytes .../MI300/fwd_hd192x128_bf16_causal_rtz.co | Bin 27168 -> 27176 bytes .../fwd_hd192x128_bf16_causal_rtz_group.co | Bin 27296 -> 27304 bytes .../MI308/fwd_hd192x128_bf16_causal_rtna.co | Bin 30312 -> 30312 bytes .../fwd_hd192x128_bf16_causal_rtna_group.co | Bin 30448 -> 30448 bytes .../MI308/fwd_hd192x128_bf16_causal_rtne.co | Bin 31912 -> 31912 bytes .../fwd_hd192x128_bf16_causal_rtne_group.co | Bin 32048 -> 32048 bytes .../MI308/fwd_hd192x128_bf16_causal_rtz.co | Bin 26472 -> 26472 bytes .../fwd_hd192x128_bf16_causal_rtz_group.co | Bin 26600 -> 26600 bytes 12 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtna.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtna.co index 64280d615aabc38a1ec87940e90128d8fa1f8dbe..6f14a99186f24c40c1bfcb9f7609457904dd254a 100755 GIT binary patch delta 725 zcmbR7k#WIC#t9mX3KKP#J5KOr00S7!AOPVr0x32io=^bcPndv8LxmX%CJQo(3u366 zY{{suh#?9!1Fm;-Bjai%WoA~cy7~h>`{n*?Hq--c(QMb~*JxbPpuybS(7>>09|J?f zW_OiFU&#w55I4X`9x#KQVb0`-h2oMMN@3yWX57~ z$vH)+vdf@)J0=?zi!)A`>{-0C-h?41F)1}C-oU`j9Lh8_aCC-=8acbdn8qe>rilxj z>1GI5>u78SQ)_B!2xFQX7{Hj8=0-54qcOxbb)d~ou5f*>rWP<2Zmw|m7@HYQ-lLeZ 
znWyv(Bhv)m$r}qrC-av}a>CQmWW#cEA%=-i-xfk?{>i_~WG7DtDrA5Ptp{s8SFXgk aVe-dvdteY4R@ieflt47dGJ-TPG5`QOUXL*V delta 664 zcmZ4Rk#WXH#t9mX5)(C-J9hXofB}qV5P&7c`U5?i{Z$%$8Ba{U zSST(z#RTF47%2{BursWf%vdBYX1~C|PMG`=C>xQ3s&_V2cFJVMVsXia22@!QP@pq$ zFmy}~EZ$jf#E_Gil$sN7U|?nnWf~edI>JPaoSk7zV-q;j#0AcDGlZ*kG&X^$H8pXA zG0oguVN6SN0~piM7-E|`&}JuBxIR}?GnfiDSGaqO%?vhgC<$O{ozqx%OmzBwj0mYXyED44vf+>UX@h2Dif8&=n?XX$i4H9cZ(osVPj@*#NH3)!Ym! zYG`0!x>=z#fr*i0a$~tZCp@)H-dS!gSOHCY70|TUF`2Q#oUvlEWraPX#^g#M$uW5+ LkgS;evqBvJkm!Qm delta 723 zcmZ4SiE+Uv#t9mX3npqVuV?UMfB+aR0O2wMDK-$Z5W;7eh)NegxeWYJ22>vtkV}j{ zsQ$_RjDkvp?bzJJxJpSz?!RV3J$yho+S zSJK23>LMt`1E$#-7$zqciA(-yK$TU3$|_8r2$Wr6hbn6Um0dCUAW-&&8>%cM$k-Vk zO#TRz^@u^0O@Zp&GFh=$T(ZC)RkjK$YcV+xDC>i!cLG$FV{#)<_CgM--epi(p2-`F z#TgkUpDf;4@4}Fin3S3mZ(v|y0A(5)o4dl82F?~R1xBupFs88?oN3|)XSx}~)jFCQ z!PJ_X!L2YibcTsqT0pGO0NU(mY77&0Hh}AMH8+8Z8X6cFGfcjyV7fW3REvp;!Edr* zk?7?4<&vE6#5MU~xw%jQG^rIplimcr$=&6$lhrFE84D&mR@gHtOl|~{43iH6$%4s@ GmFfVz<%8h> diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtne.co index 96cb8f53034b34ec4dc1c44c792b2ae19dd31de0..fb7cd19af76140cf5fecffe0c433471823ead802 100755 GIT binary patch delta 684 zcmccdm+`@0#t9mX2@^G!J6;H800S7!AOPVr0x32izEB3?UzmnU7eKfS29pID#RV}` zO}1oISHuv7ngQ3lxsh?Tk}@+ZS6%&qp8azFH5=-I#%i`}^lLP(XwYD8ZfIaww2y(I zVY9o+!C=V?CQwVE6c3nYXP7ftu|iz3pb}M92`XzbIS?po;eaY@0hO(o+z6EYP>3q) z2bGnWyb&nt;(;og0+n4b`5{o2BMMcv3MwlzS+G)EvI3$Qu4DpK_Q7OFpzNMfxCEH7 z3@UqKa$%)7Enwf2l3CL7k- P1HD}cBpoJitWyU7bVZZa delta 662 zcmaFxm+{75#t9mX5fe3+JDvz;00S7!AOPVr0x32iK2ZkYpO}V97eKfS7Lx@T#RV}` zO}1oISHuv7ngQ3lxsh?TlC<1^&4zlQahmNK{Thuc8Z?-j8<<(S>go^lZ1z_<7|eKL zvS5X{Oy4=CKm!_3*1m;ouRTXCNBia3dEwyMnYw6 zCSL@~&hST-&4Z$~i7L#|@sRIC}V~I}y diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtne_group.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtne_group.co index 80c5caa0ae06781bb948bbac0bafe18aa4b33283..c7338d0d02247eabdb9adeac5faa3fb6881287c4 
100755 GIT binary patch delta 676 zcmaFxkMYGn#t9mXFD7a(ck~Eh00S7!AOPVr0x32i_9%z&J*K151rRR7j>&?I;({2e zCR;M9D`JR3&4BCO+{n0ENrst~tFHb)&wjc8nho_pV>R0~`ZXF?G-$B2G)(qT$>C^V zShSCUp<(~#Eh=Y%8EqyvR)|Y-m_i)@rNqHBJA=dIjX>EKm8i11P}vQW9|C1JIH1Zp zLuEfq7OWJP%qT*YjfBchnCu9Y-Q$5On-7(hm|O^yZHPjZZH3BCnY<7v`ymBYb~aQt zV)8|xtVtQFtO6(sm^c_bCNoy;taoF`NlZ%3i8nB?FoH4-O$`lTOhYF}m;xhLXBgAi z49+xhgEQS+;A$NmO<-zG&EQs;o0!2wEnOj2r~_?wbTowtJDVd+GlVHNG;ns=oKT&> z#ON{kV68qUJjqP{S!*u10~+}|$|2tLnCw_*&bVW8Wt}}^#^jwq(qr;ZAh~0*WxYB8 DourHE delta 682 zcmaFykMY4j#t9mX4<>3ZcXSA000S7!AOPVr0x32ib|{DN9j2qw1rRR7fysi5;({2e zCR;M9D`JR3&4BCO+{n0ENk;C!WP-Gv zDbDCHnX_tVy$eH5Vp3{Oyn%s*0hDQIYTycE8ai3P6d1WW!kETpaHfeHoayEQSL^6# z1XF8j2Dif8!~`a4=>oAr18B3OqcKd_*&Jb-AxyEMfwL3Cj~_dYvTWhRKa}_KXRW4+2St$&B^(KoQ4! FbpT9ci?09x diff --git a/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI300/fwd_hd192x128_bf16_causal_rtz.co index 22445db8e8b1ef357deb50cbcb4c7a7578ba0cb1..fd84e8a2bd2e87353bcead7b56dabaa4b196ec42 100755 GIT binary patch delta 670 zcmZ2*g>l6d#t9mXD<*0#ca*Sa00S7!AOPVr0x32imPmr|C7Mv_0tlDk#biN7aX}1K zlPww56){AiX2A7sZe(1oq|D69RabwYXTRKk&4zlQv6}4~{Thuc8Z?-j8yXlE?PFkQ z*zB&tY0r3Ja$vH!cV9=0%a@QP-UH=vNt9_1j;%@ zpvp!8W@P*$J_RaOWT z_)HuO5|b~c?5sCo$Vp5}&51WKFf)fST`gQ;Oe1GF)7S*gG;x75UChm3YAsC-U`$gZ zCm7Sr60X+539i7>)DWyd9cZ(oGu#o*E^yOa4NPH1xH+3_Hb@I#Vw9NNn4!-JPb`yn qW|#|hK%>0_nv^OgGiI7Ic1*U+v}d%KTnQv4Chr829g}}%ssjKqLVle9 delta 676 zcmZ2+g>k_Z#t9mX3npqVcNDN^00S7!AOPVr0x32i7D$5d1)5Ol0tlDk!(>55aX}1K zlPww56){AiX2A7sZe(1oBrW$}v!NbnoMyX5zeeMV1`X!s24+^Sy7~h>oBdTd?Ill` zKrMq(JYbrgVanvfWN}G@TvS;lsI0-{g&JWqGnuWtTx^IVK-W5oZ*bd^2Td zy%9rBVp3{Oyn%t4DU9iA;S6IMIm4O8CUB;S3!LdR0~`ZXF?G-$B2G)(qT$>C^V zShSCUp<(~#Eh-}Rk~XGLtDzJRm}Y0-nEWtVT(YAGRaOZqyJ50min!zkJ5*TZr&(xjuZVWkzNvS#U1_l;JFs7R;oayT53==hSg)@!K;7k)YIMc<_ z1g6%~%oWBoHL-*-&79zBEu0|=)PXizn!z36h_Kn&4Q`sNk?~|kCDYA!)0CNj&SuQi y=Y%Jl$(EVsf(*@2k0wLu$&H!jObp4BcV*f!ZkYTt(;leJGRvNcA#HM1mKp$HF^KX2 delta 688 
zcmZ2+m2tsU#t9mX3KKP#J9gMJfB}qV5P8F0Ou8yQzC$;kcJY^Vnsr`fL2uhF=oL4&2Gfti)7uKqyJ{>dIHIh*&W zh}bilOum>bF3DgDwHHc>gK2h#hRKX6;*u+JQDt?ZvMVMVf@B>~Wu2k2FD54fWj!)c zWh0@o8zxT#%F1}4%I3rM9t6r7grUl|LS?s1{s@$9h(nc~4V67HSus^yvLzc;Rt^;M zOdJdylLJ$C*1Is|BqpWi#2Xk`7{Hiru5hNSqa#ez$Q8~sHiI)w+~7^3C{PF5Y-t8}gd@UcXE(TMu11EF8I?>oKTT6+V(ggAnW@hSPc4%j zGtGs5K;!)fG%a!PPUg;(oxD6#lIcg%WknDGgoF^e delta 43 zcmezHmhr<|#tk|uGIIYl8|oPt7&O~8`ZXF?G-$B2G%&Mr)zu&9*}vICWknDGd*l$g diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtne.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtne.co index 20d2fcef3566c77259c2482cb7c47b658dea947a..63727e47a9f7fd4c5c69f9d3308eb1b6afa6bd55 100755 GIT binary patch delta 43 zcmZ4SlX1mQ#tk|uGR&-8b@c~&_RIa(Y^Y~oV9;#W=+|gm(V)TH+_2e0B_RR;a98QvsvvSqdAL!XH_g}N2o`Hcuvt6TKqj5!p21`rBW)GDY5dd^&5Sah~ delta 43 xcmdn+i*W-G>8Qxa{nu=$XJBB^Y}e@5Xk5{t!P3&e%*s_)f1qdoW)GDY5ddq)5Sah~ diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtz.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtz.co index 9ec9f631061b7e3ec85141d4398dc74c735bc118..a9d1aac642a5c6eaa7b9d87666242979312bf07f 100755 GIT binary patch delta 43 zcmaEHj`77g#tk|uGR&-8b@c~&_RIa(Y^Y~oV9;#W=+|gm(V)TH+_2e0MZp;WcJB|; delta 43 zcmaEHj`77g#tk|uGIIYl8|oPt7&O~8`ZXF?G-xn4H!!nu)zu&9*}vICMZp;WZa)vw diff --git a/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtz_group.co b/hsa/gfx942/fmha_v3_fwd/MI308/fwd_hd192x128_bf16_causal_rtz_group.co index b0413127b26911f5e61b8ec87e000891ea5e60a2..8d543b10af0b4bfc0dfe7ac9eaa84cd9c1667373 100755 GIT binary patch delta 43 zcmaEHp7F(b#tk|uGR&-8b@c~&_RIa(Y^Y~oV9;#W=+|gm(V)T7(y-Y>Wq~sQcux>k delta 43 zcmaEHp7F(b#tk|uGIIYl8|oPt7&O~8`ZXF?G-$B2G%&Mr)zu&9*}vICWq~sQZ?6zm From 7df7b36547f4aeba868584629ffcbddb42b97d07 Mon Sep 17 00:00:00 2001 From: JaxChen29 Date: Fri, 19 Dec 2025 17:38:51 +0800 Subject: [PATCH 12/40] fix swa case mismatch (#1694) --- csrc/cpp_itfs/mha_bwd.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/csrc/cpp_itfs/mha_bwd.cpp b/csrc/cpp_itfs/mha_bwd.cpp index ffd3ccc2a7..f90b2fb421 100644 --- a/csrc/cpp_itfs/mha_bwd.cpp +++ b/csrc/cpp_itfs/mha_bwd.cpp @@ -463,9 +463,12 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) if(a.mask_type == 3) { + // Note: sink_size=0 is passed as the 3rd parameter (attention sink not supported in bwd yet) + auto sink_size = 0; auto generic_mask = ck_tile::make_generic_attention_mask_coordinates_from_lr_window( a.window_size_left, a.window_size_right, + sink_size, a.seqlen_q, a.seqlen_k, (a.ck_mask_type == static_cast(mask_enum::mask_top_left) || From 1e7630fa1c9d4605bd5bbf664085f4467fc3aeb4 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Fri, 19 Dec 2025 04:39:19 -0500 Subject: [PATCH 13/40] fixing the fp4 gemm tune script Exception caused by tile_m name inconsistency (#1686) --- csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py index f98dac5d85..469afbab0e 100755 --- a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py +++ b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py @@ -149,10 +149,10 @@ def get_asm_kernels(self, file): shuffle_df = ( df[df["bpreshuffle"] == 1] .reset_index() - .sort_values(by=["tile_m", "tile_n", "splitK"]) + .sort_values(by=["tile_M", "tile_N", "splitK"]) ) kernel_dict = ( - shuffle_df.groupby(["tile_m", "tile_n", "splitK"])["knl_name"] + shuffle_df.groupby(["tile_M", "tile_N", "splitK"])["knl_name"] .apply(list) .to_dict() ) From 126d28fd2593f9505202b5d08d96b7a2ab2d5e54 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Fri, 19 Dec 2025 17:48:39 +0800 Subject: [PATCH 14/40] CI: Migrate Triton tests to aiter-1gpu-runner (#1690) --- .github/workflows/triton-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/triton-test.yaml b/.github/workflows/triton-test.yaml index 1aa47ac333..0bfb935143 100644 --- a/.github/workflows/triton-test.yaml +++ b/.github/workflows/triton-test.yaml @@ -29,7 +29,7 @@ jobs: GITHUB_SHA: ${{ github.sha }} triton: - runs-on: aiter-mi300-1gpu + runs-on: aiter-1gpu-runner needs: [check-signal] env: DOCKER_IMAGE: "rocm/pytorch:latest" From 1127ab4ba38057f0d598c00108d9c6f00666f50b Mon Sep 17 00:00:00 2001 From: zufayu Date: Fri, 19 Dec 2025 20:24:27 +0800 Subject: [PATCH 15/40] add ntile 128 for a8 blkQ moe 1 stage (#1695) * add fmoe co with tilesize 32x128 * add ps co * fix pertoken co bug * add co to csv * add 128ntile logic for one stage asm * fix mem fault during perf turn * en vs for pertoken kernel --------- Co-authored-by: feifei14119 Co-authored-by: zufayu --- aiter/fused_moe.py | 2 +- csrc/py_itfs_cu/asm_fmoe.cu | 18 +++++++++--------- .../fmoe_bf16_blockscaleFp8_g1u1_gelu.csv | 4 ++++ ...blockscaleFp8_g1u1_novs_gelu_1tg_32x128.co | Bin 0 -> 29288 bytes ...ckscaleFp8_g1u1_novs_gelu_1tg_ps_32x128.co | Bin 0 -> 29736 bytes ...6_blockscaleFp8_g1u1_vs_gelu_1tg_32x128.co | Bin 0 -> 29328 bytes ...lockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128.co | Bin 0 -> 29784 bytes ...f16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co | Bin 30232 -> 28880 bytes ...blockscaleFp8_g1u1_novs_silu_1tg_32x128.co | Bin 0 -> 28776 bytes ...ckscaleFp8_g1u1_novs_silu_1tg_ps_32x128.co | Bin 0 -> 29224 bytes .../fmoe_bf16_blockscaleFp8_g1u1_silu.csv | 4 ++++ ...6_blockscaleFp8_g1u1_vs_silu_1tg_32x128.co | Bin 0 -> 28816 bytes ...lockscaleFp8_g1u1_vs_silu_1tg_ps_32x128.co | Bin 0 -> 29272 bytes ...f16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co | Bin 29720 -> 28368 bytes 14 files changed, 18 insertions(+), 10 deletions(-) create mode 100644 hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x128.co create mode 100644 hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x128.co create mode 100644 
hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x128.co create mode 100644 hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128.co mode change 100755 => 100644 hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co create mode 100644 hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x128.co create mode 100644 hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x128.co create mode 100644 hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x128.co create mode 100644 hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x128.co mode change 100755 => 100644 hsa/gfx942/fmoe/silu/fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x128.co diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index 67dd0df318..abfefda272 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -645,7 +645,7 @@ def FinalFunc(): doweight_stage1, ) in fused_moe_1stage_dict[get_gfx()]: if q_type == QuantType.per_1x128: - run_1stage = True and (inter_dim % 256 == 0) + run_1stage = True and (inter_dim % 128 == 0) elif q_type == QuantType.per_Token and q_dtype_w == dtypes.i8: run_1stage = token > 32 elif q_type == QuantType.per_Token and q_dtype_w == dtypes.fp8: diff --git a/csrc/py_itfs_cu/asm_fmoe.cu b/csrc/py_itfs_cu/asm_fmoe.cu index d22daf3058..d3c2eca940 100755 --- a/csrc/py_itfs_cu/asm_fmoe.cu +++ b/csrc/py_itfs_cu/asm_fmoe.cu @@ -260,8 +260,8 @@ FMoeKernel* get_heuristic_kernel( uint32_t tg_num = 0; uint32_t num_persistent_tgs = 0; uint32_t round = 0xffffffff; - std::string arch_id = get_gpu_arch(); - std::string selectedKl = kernel_name.empty() ? "" : arch_id + kernel_name; + std::string arch_id = get_gpu_arch(); + std::string selectedKl = kernel_name.empty() ? 
"" : arch_id + kernel_name; int vskip = 1; static std::unordered_map> impl_ptr_map; @@ -272,8 +272,8 @@ FMoeKernel* get_heuristic_kernel( { for(const auto& el : *cfgs) { - if (el.first.find(arch_id) != 0) - continue; + if(el.first.find(arch_id) != 0) + continue; const auto& cfg = el.second; if(cfg.vskip == vskip && cfg.smf == smf) { @@ -675,8 +675,8 @@ void fmoe_g1u1_tkw1(torch::Tensor& out, // [token_cnt, dim] const int token_cnt = input.size(0); const int block_m = 32; // fmoe sorting kernel and fmoe kernel only support 32 for now const int estimated_sub_X_cnt = (token_cnt * topk + block_m - 1) / block_m; - int model_dim = down.size(1); - int inter_dim = down.size(2); + int model_dim = down.size(1); + int inter_dim = down.size(2); inter_dim *= model_dim / gate.size(2); if(fc2_smooth_scale.has_value()) @@ -839,7 +839,7 @@ void fmoe_fp8_blockscale_g1u1(torch::Tensor& out, // [token_cnt, d int sub_X_cnt = sorted_expert_ids.size(0); const char* enable_vskip = std::getenv("AITER_ENABLE_VSKIP"); - if(out.dtype() == at::ScalarType::BFloat16 && inter_dim % 256 == 0 && fc_scale_blkn == 128 && + if(out.dtype() == at::ScalarType::BFloat16 && inter_dim % 128 == 0 && fc_scale_blkn == 128 && fc_scale_blkk == 128) { if(activation == ActivationType::Silu) @@ -850,8 +850,8 @@ void fmoe_fp8_blockscale_g1u1(torch::Tensor& out, // [token_cnt, d TORCH_CHECK( false, __func__, "Unsupported activation type for fmoe_fp8_blockscale_g1u1"); - impl_ptr = get_heuristic_kernel(inter_dim, sorted_expert_ids.size(0), config_map, 0, kernel_name); - + impl_ptr = + get_heuristic_kernel(inter_dim, sorted_expert_ids.size(0), config_map, 0, kernel_name); impl_ptr->launch_kernel(out, input, gate, diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv index f25b1fa86f..1e06c66b40 100644 --- a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv +++ b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_gelu.csv @@ -1,5 +1,9 
@@ knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x128E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x256.co,0,1,0,1,0,32,256 +_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128.co,0,1,0,1,1,32,128 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x256.co,0,1,0,1,1,32,256 +_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x128E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x128.co,0,0,0,1,0,32,128 _ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x256.co,0,0,0,1,0,32,256 +_ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x128E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x128.co,0,0,0,1,1,32,128 _ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x256.co,0,0,0,1,1,32,256 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x128.co b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..f6ad84f26ba8569ec6e9fbc094bbec2c7694535a GIT binary patch literal 29288 zcmeHQ4R}-K^?#eBX`A#5Xd$gljTjj)h$U^Yc7c^oQ4tC)SbUfvG`CHqwEZ_laex2y zHvJF~5D^6(ZVqKmH#hg;rrRh{K|w*Oh=UC#exTq$amo}2=KnkIdvALaqm_m|bI;TG zgx@{qJ?Gr>p1k+mn|ptzw@#TpRTmdmVqoH>XNOswvQ1E;Y#*%?8`I-SSHu$WZz@Y> zMyScRL=1lcgGw+J6BRpBtNT<`VNhZfqKYhC$mE9<7|RthbA)5*H9}TV>qohkE>gyC z561=}vd7h=l>Og(=Lt@$kY(f-c$Mf>8ZQ^NF$z&%+)t)e>3 z1gQUjF&l8gHIuI>ox@n*7oNplpWowHBsJDawf;L-ZgzMYtNjOAMaM53p30^ro?5r3 zvQqNugtJQ-C25+o%NcbC5qGe&hQE~LtB)c^_t=EC{E`)!#Qe(c~N%O zYlcNpoX&TKlhq7MqU@~K45vkLI?owys+gswSQ=$-y{1?Z#p!&fctv-L3!?0;*A#tG 
zoX&TOle<%Fh_bg{Q(PLw={%?SVA(a@Iqs69?621(cP)$Jb>6cqFICs$t~F8i*L#*5 zqIjL>EH{;1+nwYCQFhmRl0S{&b)J)auxv_CDIbrrzuvQaGK$xiW~rV>y)}!wixU(9 z4C*O+8seA*)H=|cqV)yS?_V-2uB)&K|l5Mc8{PgA9@ zdnp|5Lx`{{yd7(Rqp=1!-iH7WNezun-9qCI>IZ@G=y{9HA)Y2(CUvct{F6EK2$iBD@fjV~#K_<_OdK5<$`) z7o~lP5Y`^bV~$V}bAWnz2oYY0bumZS z5Oai$eTeW#Q+*R|3pZ73mc;{oh!b9ohhmQK)0iXtyblo`Zt%OkQeAz|%6PmFQNnBS z+n6If8FPd`olAtL9Mz5WOB&pMsd}+g*W|9MSt@zmbo>7$ddBoS{(J!s7P<{QwKdgs zh9|zDCzsuE!6zMyJxkqHwVo#TP4$h7l;C`fiH8OU#<{PXQRukg_S9)rtsSQQ$>M={kxa;a~^t-F2+9mEWP1Wvv=h89GqA7oLG&I)S=xLI= zGwMm1{npVa{STgXgzfn*e1D3I`s)bOywfAivn^|c2|V#>v_$cwu) z-C=j@*e@M7d2W=d8tYL?sz{0YnC3>Q(O*+vw^n_IB%bty|4U;7^?2H&J}*;-zY82d zPq@_QZHk`k)ps_w>2Z?IE_Aial5hL=h4YPi=;%#$>f zkPXlUX(5#W(}6P31`Go8fNY>HSVH5#QW^&?pmAUWjStcVW#9^?v#rn>Y$1bTSO}j} zNT(CX7{?A~v~9(PB{LF4x*Sn0Hvf>$SVQUZ1S1YG!ijK_?rP5QMC3Cz5S~%O#V&Yt$%N9z8m6 zPlkx|8ZaH7!{AJ;1>js@9`GJu3Gg*wDRA&$tOej)U;~hH4KS~Lc4#*05Ms0OdD&P& zm-A-E%0gvaR|efhimp6V&UNL`4OMh=LUXuo4s;hQy1AjbTsIdwyP~@`bS>9i3*9A( zt|C;ybrsMJQ*`q}^SEvvbl*~RJjW|sYfBYb)@^DogFL%i{z-E=z04lTmgB0xBQ3Zd62K~mOGmhLZ}Uzne1}PA!oiU+Zk(PUR%Ih zWfz~H&(`7d*$b+xe69FALtbBUBAPwPfhvPt2Y(qgcTI7vjh{Ad?KpgvE{jE$v$6sS zYw(#u-^NPquF_Ip5ZkWO***(9*qOx+b!M}_bzZ<;@0>CL`uX!^`Fr0BP&{g`@mDIb zl9OdQB}K%VO0mlAuJUqU5ZkWuxjxt23ZJXuI^R_C!|@+tFLh$cvl%ToKF2*Nt839U ze7G!+7!eS28fXs=%?=@Q~ti`Q#w^Lmrb z(qP{NvLEMKn}Yh8v6hyhcj#J>2XygQQNH8kaW&){(@ff z+fJR{m=I!5@cD<>?_h60J$?_=8A7b1^E;0|`P~rP-I+Xe?G`6H(}~{LJZ9pqts&j@ z&8S=`q+39EYe=`SIdm&NRJL+!C^dB$9Ydk?bdG`4RC^#j-Ja0mx?^HMzuKjDt<7=e z$gTU_l<%r^mZ$pFkVKgFjY7oxooM3xK}`js^Y>SOk0$SPc9F&;|Sx za6GWHnflHqZ(_1IMyrzZlM;Q@pO)dHa3v=C0*Q(C7H(roZedl!^35h+LIB5z$>a-| zO!lQ|D{*RHf%oSJbhtDq5x7kjxFaZV7c=njJtYEPl?Cn(3Op#=YWHLLq{$a?OqYjp z93WrJabQBgu(dU@ZEFLL58B@v_R(I}qBjxh+9##*^2;STjt!UP(rRWN%nL zDV5N;iu{!IOXJ%0OET8)I@-^V^(I2y=CV}2-t1yOd|i{Bc3qQob;DrjmZ}s>;-jWQSY`2X6NBTt*bEmr2fe z#LveEKL)*e+|V(L<2l8WD1VmgH4Q0^3pw>w&2q`(d)G+1+yHQ`WXfFz+#n6jT>-pL z8kzfdP+z#M9Qg-f2;6}-ryBhdIsW_MFn0Tl7987nko^;#|UoVqB`Cd`y 
z)cYs#Iwbx|)Y0?=>OsflsP!_{C;2H+r_?QFe6AM6qd8~ajaneyh$2yooO?wra_<+l zupl1IJ^KmF8SzHAL@jcj61B+P;!pi-DCRQzYtGx=W82%jL!~ahzFf0RTMe?I-JqO< z8kFOq6nWY3nGz0dpOm8ZE0XQYC#4V$TSa=&cV#<2J`S@fe7z4UCx3j_ghMx%rKo+5 zWcwp!DTKowD@zFqzqi>`?w3&keIE8p=xnd#`=!1}a>4S-6vDBqpudFg?s7$(q>)xySE}t~=*p#m)~Ud`QnqzEaK7ZU&H!F76DI7#&wj`ct|@(I*S>H8#)dP>yGu?6)K_8(IniV&Z$_-@pL))~$ZB)jev zbsB#^^doruPig)|h)-Dj1nNWU4d)S(T~CSnjNgL#2>Ua%?kvi>>$NUecdBh~b>Zs{ zeT>%K`X!|M2YCk=u^wSgYQ~N26Yx*f& zH*CN@(lHmpf!onT=-NQ-r#qE?x?Aa|$Ymqh9YGI4F5Ug~n9@&AK(F>w>fLHT9oEw$ zd+n#gdu)5_D^$MLeoS_9@I++fPp`{q!vQ>BycQ+3UPV_1N|{@2}NQ@tM-`KXg0# zfpYyvvVEu0Pj{1ExsFw?N7U;I>H{`>)`UZkMD){RN<`kqQ+s5ub-}t*ZF{SWws*$b zPvcnI)@#oOwm!`s>U^4Q?v&&H)H!>iX{+6}uDy)oJ?-TjH?+^;cwhTmjvL#r<@m$) z3XTu7&*QkMy@{?Tu;xaxR#}ddOfC9qQy{*^)S9r!25BW#A=5NCN zKRQPlhZy9LuBds-WgUefT`|WT{oXEF?>7(z68r`$u+4lE$Bwj{xPI9nF}`Am7+-mT z7+-ziO$OX6w1*OKPmZn+>3;8(_w9G)h&$~iyIw9mG9|(ErDIwqdiI8tO zE9CzRL7r07Y@Jfv9Kt<7#u_F$FE27Vzf)|=iHkGm==J8D_;_+dcg%|tJP|@UU;Fo72_)}z*7;7uf7oeQ(VYvd~-YQxwPYc@Q?7B zwRwFuy7x<;AMTCXo%}wq@>$~^hf}$?%w}cpcgnbNGY{AToC6F1zXxpXSqF5_r$f2d ztc}NJ@y|(?1qRq0F1N`7JJ>ufFOvnXVBh2NN?G9QBBig+RPLYB^9s73*)&t!vkBr{ z&*JnZLS4K*ji1+St@1X~573(kb(w>p$MIn?$U7h#U{7d_znI&9+ukbgB0cO0jhXpr zdw_|sBQ(WN;&!8kx5}@Q9(IJL%(Aroz+~7Frp8xtyV0Y1&Ibo+9QK5%nf|mT$eVIa zPtvCm4op#U&J#JOXAMfDoCjJ5rBRLCm#N9@^<$B9=gsb$H zU4-}QEiVz?uh=SaUnSimddq8skLfK337^ti{zkY(Z_!?F70E^IGvpcVGv%4>k!vSQ z?`_V)y{g{k)Z09=5hojQdYeOUdFQcddx9-`JjxGs1uKm@^v^Db=NU%&r(1>oY3TD6 zeUP2#%9mc@tK8y-_L}8pT_S6jo)jJiSgGW{Qmq)q5reS??sK@{$Gjl{Skf# zek1e;G=94@et&sFjQ=IV@2|fX`oC)Yc5D0&J|)HvM))22qtGAH_`Rg@``Z>V{l(k86+h;=Z@(dkb6G3!89Xnn0d51{4tx&!M&J(MI^YY?e;>FDxDohL&$*9z zYWL&zQVDD1IKd|5i8dimDiQMJ5+P4nAmphFgnR{S<$lxHI*wP`gnX4v$XAyLd3uSE zuUR1E84H9wlb(}j#C_h$t|?Wj9))f$lpm!l;xzPfO;R&q-*QGMmUV~nhob@!lCi4{2Wc+ecvWM&dr2FGw*_4 zTtgp)Y==GJu=oeLJ$+|API}lA4$J&a+DYJW*b$D5f0o;yW1SSd_Iljj-4%a&;xW#d98?qU4J2Oq81| z>!UQn@z#&{HG+Dd=rEm!pVnL6B;2mI93|}1TizqwC$14J%DMa`>E6&={y}&|Z~2t) zL)Z|W66bw2?%Sl>skgjCxLa>IPIy3Xd7to*Vyndcm~_YVmVXkS&|A(Bp4MAFCp>%Z 
z*9g7MskeEk*9g7Mp|`y0THx^0qK6%h=xuL2EA($@dfQ=5PkeK`7=JUu@2%&B{wzf*q~`coRe z_cVV0JR!#a8R7TI2SWde#_#VMzfV6D+Yy3VtCB{FC@cY*%LjNy~ z-wBQ1nbTtYOoZRre+m6rjonHvd0QW=YZ#gmW1uh8PPY8QvNzd!P zgabmHku^9!5*MWVnI+jr;ee1GaYfbvi3{?DXO`rreP&6r^30MrBjiV1knU%eG_HAO zNgNQeA)d$vAaOxxF6uK&;)IYs8W%+S%#t`E8_DiJ%mvXtvm{Ok)h`+sMElH=I3ZMn zXj~BOGfU!xP%WZyLA1{-i4#J#_$F~d`1%BAtrU3?Cxno=AnG&ANDc^%6Gx;RNL&!U z?hG9=$;1gEBrZrK2ZZ#*4XFSU7lf}Rg#$vI5YiJDB$5L{dg6x6CqHF>DhGr(A*3fR zNF)b@^u!JMcXL6s&n$@(LiHgoh{^#WnK&Udf8v7h^#$%9$;1gEBrb@`0U?<BVn1?ldmG_L8V!~r22 zBX}YkfW!p}>!;v^kUk>{7bL8of)lcl?Eb@Ckg$FVP6*X6ISLmfte=7tLN!Q^!UYNI zr{IK8Epno8LBjeeI3ZMvoNo{pB&?r;6GF#7aY596$|5)*G(HR*k#Zn$LBjeeI3c9B zg9}n3{I;oFRMHOvH-wIV;(~*To9fsI3ScOaY889Xj~AUD>xvOBXL3~$9^tI55J|K3nDlv{alb~ zd-}N`;yFn_7esl^)6WIrTocX>`367dx$X29xgh*}3l0eJD~J<9dB)Z&lTU*C|BZyP_EIqAUs!aKqyz@gix;0xF9@Na6l+W;)GC+{alb9eoH?W zL|h~Eb3vx<>F0uoYlMC-h;oh4&jtCfxJGFHMCF3e|5wto6^{n_YtID&&5CSMkRBz{ zYd-#8D0-O$Fj>J3wVR}s+{z_A^4S&1IHhubJn={XW|7^oA zt*x)CmKxm+o+cmt1tzz@-reNGFDEuOEcUxAJ#}vXl7@!*#wJ))F7i~@)Kz0#!jD!i zTvF|>sjI4YSJgMVJ+-y&dcV7&vA(LNR`Or!Z>+quw)V!wqoKJ$s%+wdEzj%qcV$4Bb*;sa%l%KF9h%bSdTlCsIO5a@%q*b>NUJqx8;_n7=br?Qzh zVRHATvtX*KEtp!QZjPNeN!^?svW@BIl;7^~aBLcGbB=0Gp?I4xZyWcuAe zda3R literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x128.co b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_novs_gelu_1tg_ps_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..eb8a1339dded3948b915897df7c85b77644847e0 GIT binary patch literal 29736 zcmeHQ4R}-4`9Dq4v`zX2w2)$3BUV-z#FDmHSzzTG9YTSE#fMo!lQxCYc1=;--__gn zLqI^RsNis8DDz`;b02QHjW#MMC@AH_!3Gl*6dd?5Wr`E?|Gnqj)853mO2eMH=V^Sx z@4oMO-}k)d+>`g--19TNZSwRfx~Ql^0~0SjJI12aV}e5U_>o$1Fg==d1uPc-CbD>D zgqnN{Me`TXuLhGCtJ;xT_Nl6ZpfE#-5?Qj4$qy+omMLUr3&uxg2w9@mk8*vaN^M^q zjOy@MiC*ZXpAOhgGi)^hhix_GqGjdf?z$G?@?fpgC7oi!I6boHmQo%oG%17^jP!;Z&KSHB4tcW|$qu>0)O%S7w+Krn4S1 zEC}Otu``?`Gb{|#S&tb`3*&T=Gu%=#OQu*9rneqbED7Uuu~VGdmEyuMz4e%)Cydj@ 
zPH|FKinU>S>oLXUVVo{%FkTlu%i?x}ntOn*IR`BWIMFU?Y3M_rXmyC%mk0vO~q zldir!2GCrHcW;k0z|Kem?CwQ?hdkOz*xQQ;+Pjh62OS>r)w!3 z>qUs5SvVePfRm91INgf?54mgW>boZ4{a!=}nuL!c4e&{%0Y2+RfGs6nIR$&7ST6!- z=U`8iA>s(J5l4vcO@yg)HS>_%n+TfPxF;$l;s}FTJLqFk*ozRs6EP{`2-6~tFugYs+}h7Y zQEwsym51VpBa}oOVL>k_v>=72%3Ff+yk#RlSJtkb9DLB5HdPB6uR| zBaW~<;t03)BEnWTzP)Oegwl%`!86euafB5SM_Ap92oKfQ)Gq11nOfJ25Wy3%KH>-) zB95@B7ZDz>ilkeg^G&$nax{KTUT0Ev#h$l_FIeYsO&ZK3}NK|C+pt>8z`C25Z1z7#hI10y>>Q#yGYzqqPIa zI_7KFv!-?fYi`%EH{12mi49SKsMbJ$F@+s#SJ>-#A8%(Coqsc?v=I0N@HYSIj)jV` zV?1l=aK^hD6z88B?NeP1@ahP-T&B+hx6QOCxf*g=%RYxo$8uTgK6{0Z*(+QgJB}A} zUCTcEbBrx4U%a@{1Zo+_>@I1N1NWEZj+Io|7L&!`3aJfb$&mf5V#lU2fhzv{d9i% zP!^F577A)pmsgbInE8_^0lxL{TN`jF&_Qk zMQtMkBYE4Efh#%21>!hPnB-`i&^)%y!R~F}%kIZ}E#8me{Uf}$;Qdp)+u8%{zIN&{ zz<#LWohtrJ#UHEqpo$NxcsIw?Kq~JqEs(}BA&|gvSYQ~(j6ep*%L12iydrP~$9{o+ z9N~XE?XJV0Bm6nSpCkM^!k;7jIl`agw+j#NR~WM>y3cQ;`Ot56v^ngCj-CCRI~2CJ zy^wXb>zkpo*%alz9b1^U#=NdQ#E6CaBUCCR>WLzWKy3g+A~+0hm~ zZEV|EoJ)trq9|!;zL<45r_eXEqAW*Ik;ji?N6~DLg&k>6V@KNuv%j@p!d`2iJRbT5 z3l!zM-}O;Ex=2M|qsEGlSCoVV5o-#?D$a5g7km6Tb`;O^IOdgj93?k+rjQ>#-vRb= zJC;0~(TLCI*ryb^79H8yiZX1NPmHO5mcM_Br|@>ioeq4C&1^s?(?>BgGj^K)4|Aa# z*kC7JOcd&~%i8R6#hcv&J>$uKtfMUf^Jm6dT0z&8M&toq^tF`lSY@o$swkIT=Hs@A z&D)zDZAP^ZBl<9+kI?vv&edRKoAw#qI>r(4qrZdL$;pb+ub(f#o^BW8`$Ic%e0*bP zwguQ9+fiQF81@&`n%}nT^v0L~dyhtg*#ketlXmzbF4OHR&; zX>{BS&${^=X_YhxP6wvKHyb|{UG7CXipgR$fSV`OnW zd5DztLcUzeSQFHqC1oOGLB2xDL|%eCRLU_z{sv_Gbo7U|7#5=_xwk4;r=%!KYN}74 zF2>E07aYrGtqMy48i0_8B@2023eW<5EHE7yFXUksArDJ$RmP~F7e&WzQ*=s$qSFC= zKm)KDXau$bV}UDxCg3VyJa9EI5x5qZ3|t574{QUb0M`Qt0`CP{fE$2=f%gN`ft!Gt zz#jpvz|FuRz%9Tm;6uQnz@GxMfsX)(10Ms90R9|k1O5V-1AGE#2mTtE2mCE?H1K!8 z0^n1?F~C0n9l$>U#{t_LsP1fW#m0-zXnA~YT&#!c(+ZplM{KOe7aN<^$ZbsVjjViV zuG!>?@!>OKGI@L^Q`YjN)wtzbh4<%&b-45w3f!&;-02s%n;Cff-a>%~6oH5Q0*@%x zszX>lNy=p$la(PH`ze=m>>uMZbTq{_chutZLH(^`AMR(3dK00pWnv;<57ztmn&Ime zYldmpE6GTZ>zvv(>pU0hd;@HfF&4u9EoFQx!*cH9`jnMriKHL6rYvz2*&!Fg 
z{#(5~mtlGLDI}*n>gDYNABSFkZs;?M&vSw$R{1p3Wg1iz6|n2e8Ekzwc}{#_BAx|FtC#mjb(G}1N4*JzqaXJs6bir7Y#R3)NbAa?uB$?< zi6XZyV*+rt+mta4SmGX%aV_u$_wWo_SJHM8bj9xe8B>7s+=DZw0~fgM88d)4y2oVD z{A6Pt5Dvcw^Q6{ul1FS5^JjZtVuF&*$MHVRM>f_2;qWIhUuxYadBoFVzHHktU&8() znuh|!CmeGR=7ZK5t`8(THi~%~_W<<6c>GUj`~`?lIOa*r53M&`M@V)&E#_z3Hq4K( z|BTk1MO}A2&I{I^wC!nL__{+Kqjh(+a&?Pd(YI)IW0Wy1q1Q@zt-C?J-(yilzb~3) z=$IQ7Qcs^z>*)@vi}-wM>*<$V|JPyshFxks-9vu={(fLR?AhyTJw1UsK-X_=J*9rM z^^{~;Pfw}!^bGuDJ*9R{J*E4G4d^2oV2U%X&JryGHg{PqVvid#WompRt{4Jw1r|q;)Ur>0z~=9#!k<2WmY% zi}}#j({pM)J+IbN#G|s;2;!)iS}s@7A)qq!J-7Gu`d({pM)J&$@iyt_vB zIPMYMwmpseE7en+Q~LZ5*@1eX-v5!DwM(t1dq}U|$Ex=u^1g!VfEDMOaL9>}dU{H& zr)TCQbltzG_aRgda?mfKeFy4+dY?&h-Y&JC?kP)<_dD{wh3Y~M`XjWT2&t#1)Ovcx zn;`Fx*-PG#r@b`nx|~62g2cJ zL+a@{wVs~Gd?sPdMfv=1E&m53BX`DCSAnAEJ3FKzzb6XG7}gIkldi$NUKU zBed>hjqGt=u3#xhZa8aFlqk2UQD0&5MOT`dVwRZf z?4?fo4-JM~U@kBZm;=j|2Wh9b$I?q*C^v4gA&jcG;F)7H9w#m!!cdIzf;kB4TQcJuOS22 zY`&ReYtqeJzhZ!BUo}Xyuf9aIuf6nU1D+LT1!C|_j_wcXdG9s%AF`*5XS4L3nZDyU z;n^^|=JX*uV+$35i`cbXb}0hO*>||?Q3Nh#)45!w2&`t;ak*9zSjT2?xn2=?6PwB9 z<%+=Xvm!3vst9bLXT>zeg*)o_*qkri#K-2^BV_k0LiQXKa^;_eTy<2)OJ5Um?Fk`& z?`9Fdw(L{%*cmF^vdrCs(^FH;>4OGgy=~dIVt}6c(7tLA{L|9R8JApQ&dA6xXIy%zxe4v7FTrjZ z+SgtR|H%&IHLjrr&s`p>?Y*^>0w7`N}ZE*2pA7L!o=t@ zZZ~p7_x0ciwZon;G1Z&240%(o$#ME5!u|t0iqt66HELL*$wsSH^SBOi*)mG%1u@dYU(h zFf+rO^r@r!yM@2-SNrXaddstfEqcpN!n^dA7YXmtTe$5k@hoI7>3*WO>?7Q&w;Uq; zmEQ6f!Y5%nt~>5_(yi26o+n(Rx9ld|sJFaK_<(Aw#yvo~NA;Fh2_M&6ju1YrxBQK8 zo8F?m-zrcFT4pFST4pLUTSC`Pn%>orhG$hhjj5+`3`U&6h||*;ddfS8P21~l)Po8U z==7Hvb*P`6Hs`a9^v^U2{WH+#s(L>=)0yku$J?K65$(^0_&s;K&_Ac~dq(58eWhsM z9^$v-4x!(n@q1R|_xu{s{(Okv3wH|r3mU)YG=4kp674%f{C3?f^t&{E+ckbK-Xq#y z4DoyEKB0d}#-yPz&=Z8YSN8|Ur#_#0^MElDjey{vk=wH$Jy`b^i`xDW= zH^i^=L80%|`0dpA?b|Ba_l5ZFe^}`EYy5U;{0=-S+7E>I9sHTlAJq80sPTLCanb&2 zh~J-oDfEBV_`Rg@JM=5jekjE6@Nb0vu*Pq<#_umriuS*R`2F?wLjPBd-yV(Mk*7ub zkr2P5e-!$o8o!q{et+90+W!{f_gaV0zozkfMfGFO`wtoXxRx~mpT&M*C2%|N4&d|9 z*8z6|*8^XK{`H6$*LsLLpCCDCDWE 
ziTh1s>p5Oy74o%KA%CY($kPjjeBDAJ&sZqrnRM@y67_jIQ>Gm)DvV`WIOcLAU?t7f{xHc0GNxd6-aSweGau)0fhekih?ddz~Y0|@5de{*TPu-vN32+4L2yM}Cay#vJ#?Po7_Jp?75BWU;<(d=MltgGxP;Klw)4pgB(XEsJY@h8|9dn_M0Ta(HSDwg1F~+&SMhPT=5-@ax6&O zpF}t&L*(j+dz0rpEkN7{!;gyS+kMYukoO2ZjiIN!>0aR2Gopqa3#o0dKPU9B zYiipuO-+1bhiHEz#P7`)g#Jy9-|HH`^rtj_$2ETM92D*Eg!uja&qDuqjo%54-|54m{d9=mnZFAC z8I9jt8ozgsiuQLy{N8&_=-<=$y{++k|8>#+eu&=(Zwmbf8o!eozki$%?f(ez`|xd{ z|4`$1O5^v@DbfB>h~L@23;kJ*-#Z$=f1VNT{|xc__&uTjSmXD1jo&99i1trH{LXzS z^yf5wr!{_`o)zt%hWP#KW1;_-#_x>A@3V8F{j(6i^Zye1^BTW*HGZF;7ww;i_yqz& zAJF){r~2`Ggk$PG!W-&6!g2K;;VtzZ;iP(x@Q!+qa9X`bc(?B!f#Dkcb-hQRXX|JF zVQv!wmHFX4a?XJj3&kHiJ(+Os74FdPuFBd*ALAaOyyuxCkr z+C58>)jdn%jF2C3LAv%Vsa>;YNgNQeA)d$vAaOxxEOO71I3c7D#|6>uSrR8?6WRTj zxggp-OX7sk{DtF!X!k6M6GC$kjtipQvm{Ok%|$pah<4ACI3Y9_UnedIU!UNt6(KL; zgb)%JMDAIJazLn^I3mSB;)3vXXJ}POCQb+;aX~^kAfzX5NC}X*Abc&U91!A!ke;|8 zp&Ss>6E|c5`KkR$4hV5VNKaglP!0&`i5v2t=7MPVEQu3B^Fv$^$pImmI3YBC;)3w? z1@0fo#0eoJE{NoSkW8Eq8aHu4w0oAs2_ZdkK_mx+Wa5O-c!>+b*PF@#Ax;SCi3=h* zAS4qf=Q^IgTg6b(aA)CnVzsv;*s;A(D z(EP=R;erI!Q*c6P4&uXbL4xWjI3Y9_>0!7aLG=`z5SokhuMrm{sGfooLZ5%)g2;Nx zLO3AQJ`^00Vjyurg6b(aA*9a&7o<@5ZI@hB(hmhUgg*bo1qrIB;DnH#xFCw~+bKC9 zq#q4#2-N}Ng7CGaa_Wc^Li#*#LHxpRx8#73el)lt|7k8rP(1}Fgyv@ixFC`PLb45< z5E?&mL4xWjI3c7T0WOH-fRJnhCxpgLT#%r83Qh>=9pHjU4hYHPzzL!85*H+>o`MrX zdIz{5k^@5WIB-JxxFCI85PKgNtg18`LPaF`Ei4(G$^spz?azV(BI3RnWCr-!#U^p%a*=ac-hhRtC zkR!k^E(pKw zYW}~0Phdyfkk5c$#s%Ryg99QtA(Ue{E(p&R-_K zJXdf)C`aOiP>y|EkZyiU9~VU2BlK}WrtR(Hf{1&BJ}!uQkI=^j`JcE)X!uxiK~{IT z8eIzgX53w;o52gA|Iehc16zgsb?6d-W>vPRNL!2aa^e4tqL)bkYgO^==YbkJw5sjc zvgEHxs;r7#Fuk%GfBkBV6@L@UI(_;it3Aht|E$?pR@GEjxa&%5o%J62%T}e{n$mg? 
zevz`ScB!|t%voLPT~=FLQ&$g*vL()n%IXTVyYXY7ig3)K$4< zZddMA`B%F~;SXk&Eh-zeI2va)wrDXPG+lj zFR$m;&ZU)QP`j(@YOpk1E-VaYE2~*bzl_T0_d@HPi+~<@iz9)o%DKp0RhpBNKe6jD z_iFWURPOi!shwcYlZW>4qvTKV$rL*Muyh;8XaOB>Ef>oj@n8<$;Rw@8Zd3Ss$cKzEpFho2)|;r>TZihp4Yu zRhIQB3GzP!fg0&A>yX4O)n2wse^t%Wa6qwTyR2Iht*W7Hm-$P)6zxRYknOTgN|bd{ z4a3JEj|snlH;qgB%ioEWSco?5t57T8n92|N%R1im`?HzCufpLpZ1arl>X(%-U1=q*bHp6nW!kSSN~>pBRe%h4)vEd75O zqL=;>=b=5M3+}N~?cbvsCO}JXnZH~=H$o?K=Z@+>Iilb(IeytD`<1c>2jToTtNyVe I4k04@e?eservLx| literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x128.co b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..7a540745fd9245fa037a34e9f1a5ca7c4eeb12f2 GIT binary patch literal 29328 zcmeHQ4R}-4`9Dq4v`zX2w2)$3BUT0sVo6)9U0~%?bO?nOEI!N-nzZRCZU0SC+~0q_ zO+N$#L_|@CQ&Hx0b8{a$-A0KD3R;v3IM`s~2Pz7RQ>HjE|KEGgJ?%}1RvPxqJx}8k ze)oOP`@Z)*C+EKR*hpazo|uiBAXo>NtYKyj83C9+f@lOIxGEL+IT7K+N1LYAoYqg<86YWvNh zST97T$fVTs>7f56e$rp|Z_bfMH--A9=aXYT8-8@YvwhLIcGrVb=78@tD(wwkd?Rp%I2zQT zKHnn^9+%sBT|;%F2Q6-Q19VPby{p3Wv#Of9MXs97wp!N`&;7F=vei{q`aF$nv&~oi zV^91qY}dMKmU^6vt83kl3RCBzrInSQhBo2yK)uT?o#G=n{e02&P98!=YP`PQ^FVc; zJv_ySd5U$lHP?S_hUM~PsS!@rYlfBxPUkwq%VdVu2q)_`!x0gj&UJ=!Wrn#CPS$IN zg%O<2b%s-9hQ$$1)@z2-BRHMo3^$d}mMNA*I9snNmPc?p*C}4so#Mg>XX`aZZv>}v zo#K@46ze0Lt=ANnMQ}RDDLznoWp|F9N`&+Enq=ql2wvwr%d!$VA3N7ZIA8Bs-WkE` z9A~+y^s4S8?~8D{-jn=!1g~?PPd(BdqE}ga;ez>KFGcrq=c$MCd@Qk2=Dg zQAgO=hX@Zh)-~d>aATEbTHM!%IH9BQVAK(Q9(9CY_94PU^**QDQ(MC~^O?6eR z;qf!{GL~hhe1AD3)6^*LoU!)pfP&`+Q$*{om0N6O3sO29c}I7~mPl8UuXm(`XD>p->Wyfm~|P8w16JamGLi zVZ1T0kkDWZ@V(CjV?ZI1&XO1SVHZzRgEeR{3=iUS3hHzM8ROW_ zjMl9<)-ivFp0#usSZjwqHo57>z^biz!8|?|cO@bhyv78_aQWik#T*lY2^=R)ajcuv zI)0sl-O;g|-HrDeydS~)r+9C|`)7Es>j<(99dvaB*-uowMa5sJc&~~NsQ8eIw{c7l zrt|Y<1T#1$1(P_A436ZO70lvzLGS{O7X>fkI3PHHBm8foU19ihgg;04bA&%f_;Z9m zNBDF6e(`}l3S$;U_vMW=KKlC{>l}8&*4+bJw<>IRM=|T_(6>Tovnk5O7yGY@5#vU7 
ztq#Y!TbO&e-|fzExZPewSJ}jrl7n_OIxYZNJ&q^aIEO6L6gU`##^1JNUGgcZb z<+@VnE>Lx4!7{EZgKn6rn;V?Vb#tM+P}R)~&f~gy(B-JQtAbZ?-Br+Cr0U9pR1;$ef+xd_$(b3i=t#?_~X{%GljmD zmE<@|O1uFaJ4)twE$m=N20PR-l>NQqJoZ}0)QQk9SfD6B{Gp%XQE`pET#c2Os3=KE zBGxpDRhHu@EAs|$>?oV(b<8XGI?Av1P9r~D|3UUr2c|rm*^KLR{F92Di;fW^6lLT{ zzv$DzoWQ^|Z}ClzTO7EK&1_H?)5kC~Gj^H(7h|Ct++-(RTnuY=yREHmccR%d*gKKz z$2-;~Vf@UPOUqF^bj`>Ey4Wiy-|@XU(van zjBMj8MvsnhMEuwvAa-i1q6`?|53;8^MF0NSL7W!<_*v_M>`xsiFRY0D6}9H~9Xh=+ zF329|{SUGyU~j;9`~j#l1X+8>_a1rj`$4v=BXQWeEq3-r2Wn%}*h!sRgSu;)Fml15 zZXw}~LEWOJ;EniD*{ZF<KLpaBSZ zWU7!yrU5O`#{)Bgi9#M}5%S2)cBM$YUKAa>QPC+)icSag0}a4dpb^*(j0Y|Unt&^T ziNICBWZ)WLDsU}uAaETp4Y(dS7(+W)A9TL8?BhMGS#KiLwM|au^TE1-&lx^nF=v=|zLJat$=m*5^(I2y=F((7-*Uux_`D`N?Yt%#^P21p50@qr8Xtw9oafZ8 zndfp961uY8{EHVr9>3EK6QO-k`&d)9h%*?!%h?6l0Njml%ued&F2+ z5Rdvk^l|hV@kTntSY$pW#v*%*FZuIf=*ygM*zcSk+ur&eEOGGp<(O^SYETSq2K63P zuU-!&$jge)lyF$v(xia!dy7ruere^<=iZL8c$Icjn4?o1Hw_aW1Q4_PV(rx z#Q53nnVh7I;Qcs3<1qsBfpFC07%#Q%lRWwD#YJw2+{)8p`y^_1E* z^_12Pcj6qW=nLV%?WiHNHjwpnr&>>Usr3}OY$UrQs3FLuyPh6Z>*;amWj&?ZE$iv< zo*LO}Jsr_w+gn|s@r>_M>*+p>C(V0VPY1nl|A|8#! z(0v#Sn&+~f9#HG)A+??&9*xD&ljyUyo}N1ou{Q9U)X*M5)gvF)wj->9DAGo|Z) z*ml$db^S+j&Q7(S?jpUqj#bwqa$P}nz>3eBaM+QsdU{l?r^ic^y4Nr2I)v&$F3wA6 z-;R2qt}{u_->KHqT@^`ky(8BxR2On_K0^DEuzGq_t*6I*NpgK8*EyeKUdbBSYu&)S zlD56o4H~BrmSAoJL?i5RN(-R!>i<_4G8xOW5zF@hC)m z!lHc`Cv81Fpw`nv7$;%BpT?yS@d=AghSk$kYCS!T@e%e1Y2L{i+3UDq-bvfu#zk8@ zqphbgtaa;Er~PQB19Nc{ zYf+RKkEvN-W%9>Xn_A))o9yhxF8famhCE;%Fdvu?EC3b&?Xb^-eLn09xP5GwJ&C?A z#&+ca^MU!m0$>5qKGEHjH?guQf1EqWwB?fa zf}pO5W2Syjm!kI>2>o$BLl&^rd>zO3l!w`YoHw)9<^zWcY}nK8Tk*nT@>3l)Kj*cDuMD*`Lo54h}A1pbK4;Bt*3 zu$EoP<$6V61Dna^Mn&MYY!;W7DFT1YO1ON3B9QL~P#+g=Z{U4%J%26ln|qg#JueH{ zyHCi~e-U!cAt5h$O~~~}g#1762)XgNke7Za)S{*! 
zo((cqKiPh1p~?QeB2#8ej5$-UH)qDinlt0#%nIA|ilWyurFhdT{y06$N=Ptgna$>` zq$G1zN{YD|?X6}#Yesu}ik@W-7+}sEG{~Hpo^H+@G6eH&(<{pd>6stxD~G^8Bg33^ z-g)M%tSocZ`RAKk(7x(C?3SQ?&H3=3>Ofu-n%eLzrVY=D@5N`<>h@acnJ|5RcxIYo z=g)`L&l=A@?CLXTHhbuPyMhNg^MTF4xj;YghrpJec|gyC+SO;!+IVa>|C|&>pr6g< za;qY+oz3U+az)@u_CqeOQUtCkRO{+2^|>nTW6*O>UKcU{0$2mE7`=&57pqU<_cd#a zvW@fu^d>@G`XK0WeOL_24#)=B6B=VLp5rHCIQALb+vT_)-Y7vwSI^J9@rz1d4yN$KI^BJVV%~x9lLiRd0EL@OHh0 z+s+oxNOqI%e!b-t!p(Zie!}1AEq^6^9JUjB;%+0|3ccky!qs|9C*fUs%S(j!sJ3d{ zy`+0sZ+VsQQN86L;Zu6c-wC(qE!uThp;Fj3Q<>Q|OPSRcK6f(o?xqYp!|JV1z4c=# z;tWNc-ulp6-nnf0?m)91Tl~SUK!s6<`q^c3J;O-B zUkdYk`Cg%aS>yM-#&7riqJ4LmU)KXd-=*=}q49fVvuJ-M%x}*_Lcd4jw^QS{_hHe# zH_UI}FNA)d#_t7<->Z*`_E*FF{_<;~|BJ@&MUCJ7--!18VSWdGC-et2ew`Y>zdkP7 z{~G4^w?7E|-!y)^G=2x4672`W{0{v|=nrZ9UefseeT!)SdzjyAV7<`w^_s@-Wz~<{TRmfLZh5UnJA&^*ecm*UjP`k# zsdB4mFQ2ZF(O!PJDz}ODtFO?=XutXgs@yKx7tGMeXkT!pD&Hj9ubHWl(SFS=A%8DE zUQrSf{POvvN!QxFMmUW7nicnG!eOy3{2oo;ecvKI?#+b5(r<%atfAk9oCABp;js^J zd-~3LjP$T49G?EWloP-aup=B5`z*Jk@1UQM9(IJI()Xl%1{@7LLR;*c+)n$Q@e68) zJ)tfAW4=b9TyqmzQV8uyYR>o$NV(=_+?GOU&$=yza?DS7faAC%HCKFRqa5=yewRWx zE=%NEnD8vmxhP4^72m-q$HI&~DTGB?B3DPkn>^M`|vfP|4`%ij>hlA>!STcnBPZl z3jIeKzjrl$|2QJr{}JZ*@jF8QvBvMH#_yA(qWzOFzmx9^{Yj1Adm6ug9vAKZ4Dr{~Cee9{p`yBhWMVM>55V9i<$$B7hM9$dDBtPw5CdukvCUHr~k2oUTdzsX(*~=tu2-y&SscA(e7muS7amE{f9Xs+PzHTiqQB);)rPXGKnifV-SfWqTS0Rt_Y1qB#wx7 zFO#?;G#1|`jtHNh;Ifqw&}(IisGEowlBmEbA$_B;;oRN2I%+QoE*}5;ufwjNp&l2_%k4NIeBtg!E|< zI3gkS6kL&wWcMHDh=kNra7AeR5+iU#Lh32FA~Xhx5jY|t^%PtY8jH*b9FdTE3a$u^ zMdr7NBN9?i!4;wFpEx42p0Y4*2(=FfccctR9FdTE3a$w0bHEWP7Jl0#N0s!$!5N|J zpEx2R^%PtY(i2BS5q>))H-z-#z!{-BKpYW1w^XhjaYaa<4~|Gc_;pHd2MwCbLh32FBBXbKBO_wxORR@)ewyevZi3^LhF?B2E1q zk$#RyKS$)-%@LtB)+lg80=VZBS7aNII3ki8LNakhh<8C85wa(42+711=_Ebu3AG#% zvLkNDZs>_CvKJVMBSLmsZpePv5ohEe@T)i?JZEr2C}-k|P>zu}B0N`cLnv3`icpS` zI3he(a6>3p;)+nNkvJkeS8zipN8*Z5j{O{w9^OkoM?`Q{`Z*%gclUEd#QsS?M?~HK z>F0?hER^g6`>saIU+s0mwt|jSR?dvM5gcV=ZJ_k zLO(}DT_f~!ME)z*2+KZ|91;5Mkmjw}GUTt_=Ls~cvPDJOVx-ps{6A9kG6`TesCe?r 
zApgr;7PTE)m;B{WWmW8k8P&D;t4>8${H-VJj2Tm`_FNnOvyQm9rmnWi)8MRkHG1hU zHaUHD&PFeOX|bVxiO*T#s&)F7*4NiHG{T}{v8$@OwhHYY{K(~^rB%-A+R8d-WnF{Q zRa4`v^EvAq>ME;iJid#44HXyH)Lgq{3^YITR5WtIn(KDE@?7r9f^nCO8|TS&xhitq zxp`wR$*b_V^DZg4)H4=;2&!UH#mMfo&p05;48w-%GW@2H_LazmIJR)UR?FBBRqwbY zR6k7BAD$ShAEWByFAvohs`|39_>)z=!5wNpQ`L`N9ID?-q7Gc+n?v;!Hq!4j-VS}F z-)UsF+PX#$v$?OY#Sc$1TUG5+o7d&@GWB25H#RU^L!H~z=wh};&$32t?OIY@0kx;5 zp$;FZ+l>#D*(&Om&@Xi|`Vq@U*CL=7-r`6gt8p#z)HuiH6--qRb0c$%f&|+`f;4KMDvoi`E}A$ z9;hh){+eu-*Q=Dtmui4?lXXGjbk&gR0-Y;fm1Vt1f&9;4uul5Rx*#z}wU_PEUsbaV z98hf8F6)Ryt7<6QW&RS+M?29eWV@_e5@p>|!|*=HW5Vy?P5qMo@^?uk7Nbr3D%J`( zz8G)vmvy!K_fE5gVfWt!6rwPe$G`%-$(Q;-FL`~+zvD;0FGw$0m*x23C)xOG0{$hP z^gp3q-zU`TKUEHtFh8UJS3*zu=c?@zr2ns_LPfHrm&AEE?jAedBL2CrMxa$S#Q#dq r???Sy*P;EpIa*|4oO25!~Ops!pKyg literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128.co b/hsa/gfx942/fmoe/gelu/fmoe_bf16_blockscaleFp8_g1u1_vs_gelu_1tg_ps_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..2c51b59c2b8366bc5eea92996fa0c2b257757714 GIT binary patch literal 29784 zcmeHQ3wRXO^}pHKY?9r)fCM&}#E6jw3TxOT80kX88!bX00fQekEZI#q5Rx<-6#KVv zvUv!|%ZMTtTZ>X3t*y4Tm9`|Sf`Wo3JZxy8qJoMAA62TTxc}d|cV;uYtc_&pSNna5 zU-->A_ndRjy>sWx?A)K5+osN(ri+Lu&@=HPv12SkIVLDjj=O8c!OTd~<+Et~8_!~y z0c!Fs5Y6YIR|%$Kv|>kUwNFJA00n76RFNeLnf#CfW9dR>mO$*DC1e$~ew1r>mD0W@ z5S8JvEJ^59KOMxc|mQJwoJ zsQ-+%4B*7;rc5uK$C&3=_GJ#2+iqFvtgCibx$j)F&0?>sa35hMt-rF^%j%cgt4i%< zWlp#Iu{x*SQF?P-WxW$E4o4kyrS4jLne*orRW*z4Rog7p_GQkW&3@QYQ(o?N*0XIE zcjXVA(Z8_VWUpH8EL~bz?RZ?6mM&giUhb@G5iSqb+8wG>bO@(M7T;3JL#UDJZ>e=Y zSeauDPVrHmVoi0`E&n#d61B6$5S{g!p*e)p#m;cLnqfwW&U(!-D}>X<&TyWZVRnel zdd)CDgww^&aEh8?L5R+J&2UBtr;D87){;4DiiIJ1>ovua5Kb36#p(VO7lr7p*A!hL zoGx~XQ~W8`hUl%=6jy|By2vR$Sah8~$Gvih{(4Pv@5&Hf7d^}3LUldv-4LR`-m|Nv4krZ|WbJ!75`w(I46wNeb^dUgta>xuj zLRQ!jM)e`W*4dhA$nHacz-h=0J3@Zg5o~>k@Q^d$vMA_7h`@=M5_W_cVMmzRmk3Vn zby3)t2m$4xIP3@|VMkclhX~uSAm(T;jYWNk5x628VMlO<9igfZ5gu|*(N08dA3_99 
zM19y1R)ihl);>ho=ESF0&61G&5F>CVn!}E;GVBOz`Virv`kLCMJr7eG`Vb;;A~uE{ z;oh(#Z0SRUN9$|q@mjdPLbEI$=tG>q*?1`I2#LKV2v3I{;m;QmVY{WGu4Z{{soPnx%voJuT3)%r=_sYw|1Z-X)9?BI0yY->2KK7T zifa8+=jq91UxfE*%QE|l(()>Med*0LbxW1typ4%XgToU_Znd-u z*E{_w_2kTcXQ^}k0Gk~FUEhTdP!SQou(&H~>q^UNmRHx;K5TK{vTSip)!~WZC1Y9Y zz|9V z5BO@2@1p58DV8;3(CR+lS6lyYw5(>jjMX}QhEJ~_?!&lzI-NkqIJPoFMhA{{%+oHh zrglARZr8Cl+oj0(h6rCotIx-n%#O9o>~;JeZ)awm_W?|40q_anZQeBb0n{7>SG~}?B{Wgb=<*?TM)(Rc7RybT%953R!mi^Y} z7+X}nWJ#mL<+3li=STJ><=DxUJFJxsmRGTaH9E?YOp1!OK@jq`}L#whabv1{89v!JADDlEGU*7-jGl z5=I-miwN}wZ!KYr!7CG*4BicdMuYbOfDHKwk`r}aFQ5R}3G@O#0I~r(FJ0ILz(k-7 z%m8|U*+4c>=PjUiU?H^w7g0N~mf8pDyfRR`9_SqB_;UDK*o=PkFxwo9e(*zWqkW@! z+m*g6ImY;6I8K^kYn#+OzRkw&Y42qB;ddQ=AH(lY@VgbiKgVxdyN_*ZryhOm#|o}j z@D~dHRKW)od|1J|I41j&d4DOs6pnGeIF2KHBRHn{(l}n`yNu%%zAHEm@D1Py|J!MI z9sV5Q&k_C{;m;BN9O2Iq{v5wsaQJ}Cm|50+c^l1#^nk6+X4Q8b9N63;v(EMc*3~XG zLuawb@|9P5u8$CNM|RCNTibf(818X6GHnirOV*V)FuCxEm5qw>LROBW$u`PnlgZvT zYLqOG9__g&NyK>-n22*RI2q>}I3Jh|ya!kSd=*#-96T838aN+V3#42<%#k_QHy88Z zV{>u7GFWbx^%lm8d_`PW1l?tduGm-1b;ZyPQ*`ru^SEvvbeAi-`M&vFHy^r8MR&dL zdak=3x+@f2iLZp~N}wCA=oa`EaNPpvzNP4Rj?-;zg^Dcewl)+&p6i!?-cSs=*e^fU zFc0#4zx=C)`H)Ne@^2chhkS!y{(VCU7!xbBi8z;%0a($)_{xas=Hr3V^Ib(d=c$`a{ z*(}Q`DW0eeIH%Azv%*YUVWG>5V_V@|mzf=DPhm&fhqAx7U&3B%pE?oxg$rf*```Ca zJi19mUZcc{jg{rNI1y_a#VXFU6&Jg_IJOnfciHBbxNIdixTcXGu5TZExgATM&1%H; zIsR!`U5mD?ELk2g!Xw5sFw;9Q$yIQ>ZM_ZGv55`pVp0S%F+-Q>|1cN2!3|c@MMa=K zJ2ILbj#!g(uxldOkGHkOVg5{5ODpM~(uh2ui@cWd9WRg1$dKjBF7t3(#OCeIwl;&( zhXH*U&_{55S?6dlur2!yP95Wj_>td3?8HP_9x%Y;W82%s`2N^V93Rj4*=;`dr*@PV zHje!jwdQy2I>`{_V^8t%``GVcug84+0jSgaSZn)t9(($`KDMttc39gEEBm4y_soVd zllFG_bT>9&=6pWgBEs8zy2TB?+i+6Znhsxl{BXL4e2Ix1J@N6Gp2Wn=s7BlRNgipP zO|rG6+EQgXEzKjL{z;9}x~RtSt>YUF9de_s#ftGpVJtboXc{-&AEkJ{?r1#CQ9I=E zzVUnw+Q!e6JOft1#SVR1AhX{06qX5 z2HXnF1U>{D4*WSV3-}0dB=9ldDBv%F7T~Xd*}x}&R^V@dxxn86#{z#3%m+RV90&X( z&<6Z7a00Nsf$GjyM|7;XM$2P!VxnDCpH|{j*rKCdp6KYzMs8z_ZDi%cb4*58ln2*{ z(dhCRjhQPF*5HwEHGaQ5ti!FpK;TYU;BK$Ly-d&BI|~FJlm&Ks1s;(zs=Bd!66DJ` 
zCd$J&4v;VBI55hi?^qSx+)<0`gZkURK03e}B_pA(WpX@U4;h>In&ImeYldmpE6GTZ z?DeZA#}gXXlAp4Esa?B%Nyhr!Nd5dwG7{>x6~*)QmMQw->zeGe>zZV&YqHlrS`<%c zcpQG}I;VEcI?us6zZW)%7z^RRmNGt;5!su#K5121Jn09oEsNhmcF2Wr;5IkUWkl{K zndGEL-MoG98-$foI}&s0Jk`;>Fa<$agIx;d5peOoCo9Gm`}RyvNp-2 zkKHWhIsX31d>*2o6!U0&3iCnN<)}?E%}?xhF;DS3iuhQ~h(}`{dN<|*@kZo}xk%kC z<|6%mF&AdUqp=Tt3S&mR5jHUwsoTX|r0;OYe>Mzbnfo=zo!MjC+qiv&Hom@WbBrB& zS>K{p?m@N6^-zesGH^}_hqX+OQ|lGUnX4wp5e{EVdQo@fOn!YFV^jHhA5={KIM;;3 zwiU&xb&llBM~mVJhd*8v=M{d(*);B#R04fA`X#isl<|J4E|Q$Psw|Fh>{{rr;QhVB zrgOjKh2)3+2(8=Pyg#a=BCt>=diSEfj2lurqQ}mZKpt2>>QXj4LIL9G;Jnuq0^c+3wWb*TpGI%(gbeB?34`DuO-K+KVuu@NtD)sb3rJkO_d}!9RouXO|KO115+ZqPhsVI2^TJcN1D*3-jEJv|D&cpmGddCJ0iARKunsGgox>ghSm zm#{xb^O2AEgyRljp0xG!uu@NtVxEM3H_b~v;uDTL6I4&nD)san=115cp>?O$$X@3K z>rS=pZCr>u_Rm4rV`EZ^_t`Pn&Wwz<6_Nmk>+@= z=D41oGq4s%vL;!Ma2gw>3Zo~o(%2NW)M#Zdby-0JXoY~mqC z$L%A#ta0>tF|sQMmi66FCfUj&x~wN6;1|)A z1Iz{H0`q`*KzQ%Z9p9A|`*ynNk3m?zu*d}?F^4{Y@$JOkNHnmh3O|6x&rrVds zo0?Zaz60`F$m`=x{&SVsB(EIcG_D#{VO*VDX}n{|*BBe@b6}qf`#f%c;bZ%D(uJR^ zmCi)ds)~W8)s;!6JC+VMt-KTRU6AjFylJpFSLP;pO^VaF_L2(Yy0l8;`b)pYcwnCc z`&`)Par+A&&*;<(AI}=6*|fG|sA*kgs%ibwbkmy6kbeyMe#k#fH+>V{3(~#IFhno= zbomWCu4>Km>BezPl@4^tl3P#ciE``HfX$|xIkqO;%=Ie=iT2e)MEjacMEkl+Z`R`- zVx}(&@9OBek=_Geb6>YLRlKvM&(ieSzX|V**)^xSt&A;_1ukaSa@ipZEN9>2vP%}Y zgw5n~l`OEDUB~5GSzsNT#pQZg;7x2cmsiLFf5-~Ce5)+5f!-n07#HoT<72bGa1$S! 
zW1og=j3$}>(7g402Uz`33ZWD z0>7_iG|4+jKR_}P>XHXRkL$y%mv=+f!=BI(c{#WLc4m{jm-MhFG$iLFbONJcM`(HHOU7_4?99*a#2DzFcx-%@sVZRZuF?0`@s=vhdp6@vO8fp?!A<2VvLkPI51Ah zIa}nMm@+7VavqpAD1mZJin*NQ;5a4MT#;*1N=^dhIyg<_YL1!Aa~>L}l_A--{Z*S05McuLk-3<<~<07meRb8o%x*MSFLU-{Id1{b7yYUX9;h zpAzkV4f6ZjAB6sI8ozxSza!g4`;j2Oqkj_mqZ+@LHGY5JA=>{Qqs%RTo#X0PhTYo`KM$8lnY zkSApbd2)e}rxXZz>LMXeTO{P^tcm;0U>iAJlOg15GlcxT0wK>V5b|}4ggk4JkZ04g zPm-M2!Q>f73k#x|jAJf0LY}C|jiP<-B#n&rxsw&yBif6nXk@e(PgUe5(LR5gMn?Pm z>5AMe+Dm3=WVDxDqsT3y{f28bGTLwWo+7u3_JuPwGTIkjr^vU9_8VtuWVGKnTgcyu zj+W(^7?1jX(x_|pKO-E&eJum`Xu@HUP5d5BpM8&$9`|O#VaazvFP@>_ft(3@!r_q* za(nvBdW!V0Cmf#q+k_8*S+FA<8TlNyqtBoplOA@2Ba;s#dq3o7_(Onehv1 zhdrSs`6K>}K)GheG$jyP74kQqcOB1=;V&3FAPl!`;#b+kU&6e^}0^x+TkN7i! z`u^w`-G`r%%x@6xlFaWAc1h-U2@i>91haB4|A2I_OXhzPo{-F+5`G99!ZYH&uf{!2 zx;>KlEy8`0`4r({$^0JSQN>n?`!VTGO6GqNo|epC5T2FHUlN|X@MnbH#?;$5)Mtd= z#?V{d^ek}f8BxQI1=Y6KpA-7mHMQ-SrY63zOSHcc7zm(|;5C(;C0GG=A?L747c@`Mv*|(7&(odt2l8!Rw;^gCM^T z-xT@}HGc1C{Qh}DwEr{6@1wVc{v(awNsZsfCq?_mL4IfcA@pZ7e(!4h{&iZk|0~Gv zllO)G6OG?LG=86cDB3>_@;m#H(4W=#oznPyc1E;+7UcK&Cqn(;kfdQ@Rst7@Q(6~@UHTV za7uYbc(4B%f#DwgO?^h7ckZV@_ka^3^WWBJ&v_gXdY=%mmr470U%?F_F3ASmABiL4 z-^(QX5Zn;5BM!+%AaO*_+shBX#xgo?AAw6+Kg1I52C(g)!nj@m! 
z%OtJ{%@1)zRBi~##1*0O6Gw!vFK_}$Cawq}aYR&Z2+711p>Y#OM7x(sToKX}M?~d@ zkW5?=8ZU7~_gA;c9SJ#j=-ZV1W575QJ~h@4kX{|z@p2QJA5jF&hf{(4IG+ImVd zxOQY`0Ec8FkT@de)l;(5)>D$zdI~NH`RTzC@z+yo*VI$uhLDW`{E>Ts#1RRor{Ic^ zJ}Cr8B%q#xE3$>`{>vPZfO-n92+dz?2#!cVJq1^U<{&l%MQXU z6`{FE{RVMF0_rKaB6R%|M?|fsEQlLI?Zd$xDFzZpB%q#xD?<8Aa6}4(-%gdIO8Vj8 zjL`K@9Fc%}3a$w0i6bHlzuhV~g!E&<8KF8r91*^@6s{d{MM$3uj)+(I?NzxUq#p~; z$bXt65>QXU6`}bV1&)Zy4I$YAt_Y2vI3fY{6kHL~j{-+T<%W=K0at{^O&pPcdJ3)x z>22VMsN4{eCx9zL<0XzrKs^Olg!DFWL{x4F$rHd8>F02vX}I* zC)9F8$d0%nozN3kmbA5ohEG@T)i?JZEr2C}-k|P>!KEB0N`cLnv3` zicpTBI3he(a6>3p;)+nNp*SKuS8zipN8*Z5j{O{w9^OkoM?`Q{`Z*#qI{P^yV*jL{ zBckm8^m9Z62ZcBz-{Ae9#VgN92Fv8DYgIDo12ZhojLU(|63ZoN5MNguY9pu>)I({HOC0fhI*Z zD@a?8^s^BE&lLR_1u&0-XTJ2+(4kpr$5tl)u_>}5`e9~eHU3)GxD5Oat&Ev7r({^O zE%+}Z>&mK{>I!FFX|289MSlsa)Lm0r@4`1&*3~X^mzLS9OWn(BYisK2VNtf!UQtAuokS9WDp)lJJrL$kzLR?mfu zY=^_1V|SG2jlF8@SZB7~UY701&KYx6PMOn@b5-8e&N27{TV;#OM)=b{?|>*Y3>&Pq zR|Yo4D*EG=K>ZL!-+61Gewd zL7AnZdb!1Ace|MKFX`*+n5C}9VXwC{OTBYNJ-4Vbml z-_cW>)%u{y`aQtTi{w1C2 zKVz-X&sZk#Y5%~496bK(pe9|mX!ifB`v2Om5eKTjiu0ik>Vg}7s`T$t4CA1sUp0Sq m{oDwhnmcz?{#hgn9#hA!wyFK9vI_^H{5utYeUL+t$o?PgO6ha} literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co b/hsa/gfx942/fmoe/gelu/fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x128.co old mode 100755 new mode 100644 index 46d316a2cb05f72cd93f91fa232ada9695999c7b..8ecdf7ea30f3d0f09536eca17aacc9014fab83c4 GIT binary patch delta 2173 zcmds3Pi!Jp9DXyU2&4jCA#Gjl&eBTix^9PJ0r#MTEw$RUV)Ot8GYl|L>J&!)7)=k* zQWM=^+U{!%*%0F<8{@&m%}!%BYtwQtSq~m+_TZX$=*Ei&4{P?IMic$L`Lj*A=+W#W zFW>vV-}^D|eS860zfk+PDQhiqZic2P&a`PgK}VN1Pkmb3nM3L@5IHqY0hnQS@*3}a za1>>YB93e#L+_(u`Ct0y*jr_n|Dn& z@49Z@j&2@&`u`{M`YE)w_u-$b5XQPXv&v($HypOZoZ{Wsu(LakxG%27&;<5%$B|^1 z#L$RkcN}p$&R9MNTl=6H-wC>u-{Z6QS0>Hh{XsH`v%;{NLI!Tyokn64NuiO-?lj`R z1TBSQ&}^g;Pp25?w66h zmx_g)SmI@&vQQ9~M7~_)D+@w}FUd<~J}ngZa#fa#r3&mwzbWMMg&fpHJ|m{8IX+*= z7Wr(k#0!$d7t6d{DrWPNSROBz(&Li!_R^D}EQ;xhN*Fei5&S|XI~h0;3A_K0sl7@~!pgL}VICLQ0f) 
zHXI5X%L!k^So$OP)PT=tsNv94#&T+sHI|WJ7)MKIVZOh)Va5l?P0Ej@Hbw8u;a`hx zyj~fy8E5I_@zqM;7<*nj6uYS5elK3GJc1w$)T^d){rvK7E>-vWZ#xdA+POVMWbpJ$3 z^x{&#$mTlVDPfdSQN~1Zb2r?WWeF(S4WW`|fI2rqY=Q!hdIwrYm|9TDmq;#W3{$#7(>9twtLfv>tl6-hm|FSMEOV8^k z>OZZ}wCt%2Aunf=vR6yWDo_g?1bo0DpdNS%XaJ^xMqmbL0%AZjFbgQa9MB3J1=@h) zKmd3a2m;RmyMTG%5g?)APbct#MuyJcvMgc@QShsTBqzYm3)i!vBi1cu$TAkVYKOXamumDk4T zV&wiS?SnSg_=Y`0;H-^~j+YqgOVchXwnJVh4CyjX<7Jn5+8EL@9}?4Rtsn6rkyP?3 z>tmP6`+B^*ub1hyo^|JSu#a6PulI#`y`QRGm=PDlDi;S<(OmDj7jt3ef!&@u zUn#R%xQ~na6#&7Srou1`6IcJYMj; zkOk&(=6%M@>tY5B@6*YtYDM(_XE>$_$1-mA_$&UDfyC5i^UxN4;Th;i+B4?k!+ceQ zHf$M!7u|8>xzx-++Dryb4|e`NBE`+#z~dv1puJVIyYI_(#a K9eXNt;S(`!z*zpI?2O z{s;(&h=L9`hvMHpH}`FBx{VSQ6dWjsIM`s~4+;(xr%Z8RzWX`nyy=@5S8e#N&s|sJ z74G+W&Uv2qIZw{}yeIFuO>dn%eTpt2!DV3LrDsQ2f;=X0$>U>n;$V6r>57>N|E9B4 zW`vq-U84C*7?jRbG|7IXR?jI_5#Y)bqKYh2$Yh5Q7|Rzjvqxg-)k0QLYe%t`*2?zf zk=Q6i_9S(pmpq>ihhN7|wO7YC`)IR=#`ks4r{;VX?9>n_jIaBmbMbsq!1Rj3J=sF7 zqMG|8sDFnxD{%ZZlctu=VJz@-_hOIF@3t@UHr0FU{CBS0Y zb(QX_DzD%Fc$3%dsl2JFw%LmokEaQ`N`Irf%KLClUBg0m-DZ2ed$ISYGas=xR9E}G z&1|#XU;9I^>1XyE-E~X6m5XZYJx_?Pl?#_tS9_Z}gvo=AZjWkYiedEV!ka6(3)NHe z&5hm%YYQFGAwI@KY^blh`Cmp@p`I)=#>sk&&=SMwd`CD{jnEq7WIaYWG=|anj&P0| zVL^eRF~-??3~^};qw^f%gJsu*W89_0IA4!J?phYZ>byr;UaHQ=U29^T zujeQ?#IQQgQEn=`HXP&wF;3TWkdMZ&I?q8qST?ykl~2YvU(ZoK9mDD?qg3ysp4!FX z)=o?9Bf|b zZm#l$r^1n5xQNKYJMlU=7O#U7y>Rf5x3Q@?oP<-o@DP!NPvUj(S-cL;_QJuY3cs3y z-3hE04zxMgonVODgDGwgslD+qb&e(vnZ5C#sg1i6vf}n&iQ7YNFFb6Tq)CId7Y-t) zgDq|kL*w=^q8A=E&D5l!pcf7z(@+$*hvK+BID6sYA#cQG;p&Bp$V5zv+rzZDJxuS7 z2e0G2Mpf1~+=z{e+PWo`qnc|f3mr>GIf^I$-rm?$ zd!xJA8;+n6se@$akWmUtH`sT(*?EaeA z`yvmX(uCda*0EpMZ*t%0t!`>SDk&o_b(`i!Z2fjzO#Ye9?XW-QC*3_bu*AU{cRrY;x+xPW_r3h)Rp z2s{O3{d7T^G%jE!Pyt$jL0|!p_16Vm)DA4AcHjbP2R2gs09{Z4u3$Rr3Z20kG8l%0 z@HvHaI)RLF>|{pkRvhbCpi9r%x(uwnOP`qDa%*tq*1}LB&&9o|hy|~ap^;p^EOZ&i z_C$x`U>ty$I?P2%fy&CVw@%}O1oA7=Z@3mbaw!VutVnggl67P`sGl@Tu z_@Kl`B;LhwU}zveUv?;)V_GPU$j(MRxju(e6=6Gr7QjYyX{W!w@Ho9_Q&k^<< zVb2lv9AVE9_8ei)@%yfW`xM43itdYBDL?uLoNJv9!`4at+qWugPnV1B?b5eHXSXZL 
zWtRo6O%Qpbe(g@@+B=wMNWkN2t^q= zGH_3p@bfw_6Q9GNftU-xxxfP8JwO-mbzmuQ&>+kO;9OuMkYWumk8O5nHu4Z+v+;RZ zSsv|J1My0HA4mI}!8!}9N1=0UzbEO)geg^(K*GsWppLXJX3u`$-pJl24x z+9p0fpS9EHvlZ1;``Yk%hCIGh6AC@afog+I2YUrMcTRS$O`JA%?O1%4PK!lRva5>vytH)z)_jppx z-a)?c)PJmVZ5r}t##~y4+M#Pj9MC0RMe&YR##*h4a`D9h-WR@kd%JV3QJ%wya~N@s zX#a}N(_&;BUp0Dlj3fLfUJc)wnTpb{Um(Pu=@R4nT^Dgv0%K>c4YA*Mp}epX_Gi?Z zUw7&B#-tE?ijP0UeuMr7xDq~KN zY*S?-5kbCGm5Fo%d59_}3Hdva9n*0>v?Z}5MJc>Rxgsk|Q3ehS=yS!mS?2sBLs_T7 zvVaC4k$!eo<=>eDiO6i$=L7ciM@t-Mc4YAdTAQfN-`B?WMeq@?%) zDJiz487pyXUxD`*2X$B)xCCxf1nvk5+{FyMeUD4vYl^@FL4k)9Yuy1%pA6+`}TJuLfC(^pT{z+XuU#m z)?c>kdL8+zfj{<^1ugFFI z{UR3@_@lAsJcTjC-!P}hMeZ{q7x`QK>7NhASZ06Caof84_B8HLsguty=d6^i2F1`} zkoTZQc|DXOE-OA$!oeLA)6{xJvTgaqG{PaPNH6NHV&m7x5jL66_W|W(kI$NL@aD2K zwa$@jd#o&taL5y7X+dH44x7U5vMQi2z*-0@sr8iF zHT9I%4I6NdOpJxF|8~?6S{tbKbf>JRyJbB^EE}ocQPdE`60WDmWj#F!y;@JHcB}Pt zNOz6wv7Qd??%Pvcp?sS5%6hsV`J{QT*3*Nso*tI<^dnhMPa_}NdU{6I({r+(!XM=# zXFqa5^IWZ`2W34yEbA%!Q7&>$W6aumdPdgMbEv1oyK7{R;~vr7x2JJ`t$K>jl&=55 z+ffhX`j2GWPFYWPlU}Z4<$6S2S5O_W;WLa8x{UX;PR1XSpUP8xq z)C0NBB)Mp(tf#xH($w{ix^AJmP=NChI*vxw)8n$9p7f`w>mzlY^Eu{~S|fX`8<_`01#6dPdgMbI6zI{~G0^82$-M z_9IW)dU{aS)5FM<=zoCnQVjouC8wk6=^0s1&mlje{~?-pYK`nMFPL{~-=6ZKt)21K z(*)MO_1bfRtyLO>wZqy-FIh76My=MZf@XG_Vcm81HE*9ADj1G~U-Tdi)J7 zWcOVI*(L0C98JLY!Gyhqz#?D~a5Qi<&@mZylVLX*c9U;tQM}CDTB9=uY7OSLMMksI z3b_q(2jts~X7W42B;sk8@pQ;|I%Pb!eMLMYQ`?jo^7+CyjjP>jGI!LZm^*7z&9^N| zH@7c`d^_Y-knc!0hjW!FVp}F-TP|Z;A!ED!D`NX@);G`9GH<4Nc};)wirOsm?TZGP zm)!~ZF35L7UO!0W$|B-fDdSls<5?}^x#KJ18JYXd<5}som{-;0m{-^4n(tVYZ(eyX zNb(!vw++um*_9^_ zI2c=?2wcdn;<85(Sk12HvQH6s1Dnp}Iz?bTyN1h+iohl|gUii|z#G|2E-zIC{*aY& z`4&YWf9_3VT(G@~kInt!jeKmL-9q-hB4powA=my%$aRN>y!Z_vHy#!8f4nE;=95BR z@}ZEIo)+@Wp9=Ywb3*>F5ah|lEqRkmT0(dR##rM-#}&mXj_;MEjmD*`*&JT5O&1g>B|;POgE;Ob&o*JjG+ zhxELjp64~s6wfSzxYw}+eF~v2QJ=x z_y4Y~P1!|y^d~e9EX>#gG@&11O5#M`Z^Y0x|R?3UBBGtXw_SuBka&yb`ajFx4cAnx8B10&Jxc8_K@zUddsVXoAs6hgum2V{!I83 
z`i|@NyNz_q^_CY1SLrRg2=CQfUM9R>_LY8LBi&$KHPGk4ND$)Kzl--MW2>pv1yXQ4_JMI+iJEH7%-X-)qHFn!Hb}!v6+Fy#Y`{R0{ z|D(okyT)$Uy`p_rl-=$h3H@%3-3uDKm+u$tFGtzE@)MzdMPv7(#%|9~Mf;v8yS)zz z{a%gT4vpQbn??JpQFi+t5&C@^yPX=l*B%q?uSMDI|C!M5*Vw(Jv3vaq(f)dr-JgCT z^ncRW{ZV6g;FqHPK$P9VUkUv|jomJd-JhQl?SGE4`^#^I{x2H4-5R??&xrOzQFe!a zFZ72sb}wt}{<=l9|24|)jjclehQ{s{X~*369WVrOFKYunhyAiz;5Ojxz!#uz0`35= z1HJ_P4}rUY8-Xu(-}{)m?f@RIxL6a%@m3*EunKviOURR4LY}-p$Ws;wc`9q;cGK88 zj#pZRe3ezmSG$Bf-6iB}76^I90wK?&ed4TyFS?jA?Qp5f#B4a`ax3KVLLMaBXHU?` zXrDb%$QIdNK1m~^y?nBeb7cG6DH<8=bEgV9SGHG7)5vJAxKhaZvi-WNG&0(+yIROr z**<@|Mn?PmYlJ*lwqHL(BcuKLnL_@a$)qUB$pQ5_T#Bweyhb>J`)UuB#E4n;r0;fc@le)Jvg z6Vjs};qZa`GCl*2KtDoz;@iBR_B-2IYDa%U`@oO+8i8UhNN&p@bfn3c@g0m}Ey%tr zgV2$8R|dscl>8vaF=;YZeCMJVi?V-}K{zH)#9EyEJde2~O~#7vP!wZv_Pz|lk~|Ts zGx=>E^SCq_E50*PY|iYDGYH4!eazPg>T{AKbRT|JZ+VMwyWVn)aIfC-KH+|`MzF|x z`6<%9ski){@TlJM8R19hLwH);_f@~|kZz~m@-E?Sz2yYqLA~Vz!o#w!^!o|vj_WP| zAUvtJoFzP?w|qf(?whX>dKyzt<51TKJ&mEKxM?kLTeDY3$zJE!y9WvU~3pp?^WdPoD=O|MA?NxLLbuDeJJht z8sUgsBfKTo2=B->!n<;fa7?Zd-j{2H6LO94LEjpI;U4{MT_e!5<&*ymfcv2Ex62gx z5*LD=Cq(So(SFrea3F{?u?F`?;zES??5KYX4g~chuEaVZaUs66XGeD0Jv)-+o*i)} z$d0%W;XONQ*X-F52ZH(#Phta*xDYfJbJK8gv5nV_w1rM z5Y$c_iEf;bVRCoV)Z2ZHp(jrhOjLTLBwh!a8iAufcc@q z1BnX}QBT2%AiWJ-2$!(irgAY!KLp$ey8ej^5m8UUi6A|3ArxV^L*+n_ehjz~R0oI) z!RMCblo2O_^hMx81clu$l>M1x8l%EmcLZ}=FlI`F`(D;c95m8UU zi6H$5a3NF<1j%-AB52&ig@~x9;6#w#2`+@nfgpJtI1w~n;zC5!Q*a_k?*tb@>@q-6Kc5-)Q>n2d!Q#y#B0DiNdCQbzL3y2HB@4MhYkW8Ei;uXZ=Lh$=8I1s1MPMnBOfU&p`{JyKxA^t7a2wk75TnPIADq6Q5!`WkOjWuv>}EB#9v8ylLM(W7dSyQa3j z2JK$_XyL*oHI=pX)eV)^4NaBqy1L2+e`RA+Lv?MP*MFJ6sp_)2x*HddgysfsRWlc? 
z1s;#P(Cw)nJ?8Q;W4r}!cU6I>pm5aXg;icp;pL;R@Q%VC_NiJ}H7p$Vmkx*;!mx2T z4ZjtmeWh{%`UWih`8RdVX)Sp-w*?)%ApY=uR z50I!cvC`#{`gssz{qEj+=wtov9<$dsG<%udb8|g@Mv&QS>X+DkZoiMoe@Wlm#OzHC z9(S{w*_*vfn|W{d;@T>xy>(3u_((k-e4xx;)v%a;d6Cgi5;nUR0)4O+M*>-$d!e_k za+G6|Lmn24zao6-D4L>bi>4H-hhr|Es2+|l4tJPTI6>7;o-if8U+;_0ntuDQa?-pR z89H?SB#0hy&GL z#ko+2bEoOsc!LnEvSS)Vx~Hh|tMli2=+xMSh4^>PB7y4o)i(9Ks_er-EPGZbdc3K2 Ilo)OQzxf04g8%>k literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x128.co b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..e6446caa0b38da6de4d8eb6cdca9e4fb3b8a25eb GIT binary patch literal 29224 zcmeHQ3wTu3wceSTOhP6v-V;pXh>-$?F-!uDbfCi{fQk?xVDO`cAv4JYLXw*V#a?eb znLGpp#E1$OTZ&R2y|(tYw%V3tR8X*>JT0_PQ9;3ik1AEHIQL)sk(?REMl;;++V9Kw zP5!gi-fNw;*X*;-p0j_4JMyPbF~r2UOia9t>^O^&#{@2U{8*(pm>x^IJQk0?$t;nX zp(b0GXub#o(wV||*^kufoKh76t_&e6WT`?XJA}YkrjVIE6d#)|WQBS=iuJL@vVB!3 zRtu38apw@^`3yMxDt5|Vjc@kJIuDKStDaBgd=~8Jd|&rP=i>RKfaw*62h)UFp~`(6 z)B&{FfD>+*JhfmhWB#AJmwJ4)Zu=5%O{KS@_MX*S?e3bg+M}$f<>z*HN!>DcMX|f2 z#9LeYM2*+&DZaI)yv~ajkEaH@;@WC=iT9^v6;+Ge6tz%p5 zwdFtX#{bNIi@Rc(w|Gf;rRPb}wRq98(o%0tvoLwM+U-$B@ezz3TXb77ccDDh-B#^= zxID)Z9^&IX#Hz}Q+x}&QMe1a!5l+@+gw_Z~mpa0!DneU?lXV&4&Nas!t4kq z>oUT;2u7DW!pSN^SA>&w8R4`DMwd9kEk(0bhy@YO)@6uA5sWT%h*LX4To~bOU54n3 zV05WNoZJy&b%e8Z8RGH?Mwd9mhv(eT5#w$x!uh%ka`%b|R+l`=!U8oPcdv_ZzOJL( z5W(saN4aIrjU7R5j&QoJgM2K4)g=z{;W_!8seCHJ`MQqsnFv;28l}3Addin}Bqtyo znAAO!?!H|*(A|i4Z;#f&&S)L%?uLU$eEKBp?S=>aYGn7pXdN7m*1^$kIM}+#U034k zmpNVPos74c{dzvDXLW|*b~FL z;Xt2*Ju#-JJ;X=tA+b9irq0#nA+xP5S>5vt* zhoMn>7|{(6TW0FgklhUjp=rpC+CyH{9-Q6q@Q63$vT$|7MQ9=>N9|!+)E=gH$AedY zT@-Z3Lr8fjjM_s{)E*Xe!^2igh*`QzV_`RZgw6<0)E<0Md#LD!hey1V^@*tNhKtZd z)J5%KdDI?m?}mr1UVMAiO$n_VK0-6m6t#yHQF~b34G)jhRaGzPTuiO&hKtZdtdH8m zhNwMk>V}8M>#FMTSh%iCH!U`I!%t{79*Nq+V^Mqf?{0W_w7Ry~Z)II^Y58)mraFyv2)3 zM~x|7R8dv3q_)Ie;hj>QS6nt~*{I^ms#~y8QC_jEcvM|kF+Qyv$MR8*y!_wWt82<{ 
zao2e}g6fQ#{n}pR{SRz+gq-|de1(dM`I)`8th%PSq-t4ZUG<~(+S`^cs;W3LDcW%C zOFVd1)8ToKf&IdMtNRvjX-yRpN_laq&6-=hHMQkcm91(gN$mH8|E1PGM(p~ieKXl% zBCt2@bg6wesi*$xJ&5aQ=g4Iy-7^FK^ZHLdd+1#wj_FM=we2J%VauY;LevhBMitOS zNF&j`dh3qE4!;Na4r3x~!eG@tUs#uaJ8=~=*j5=#wxG#0B#3bZ4F-XXacp5`TRV;o z%-?2Yjcq2@)Mj9BwHafR>tlj3Ex{mT8av*mu{ZEO(Z;NXz-FY>1$+v4M__gPLe1Ph zfi<_g6Fv2s`wtC{sh)aRwFf<(gfD`3%ygu9>T_80ey7L4a#+iLN11^+$~-;?ju&!W z^M1z*j4do(ytu*R^SKw_|6lIKr4{7EQSM=*%NDZ+Pbt^OkPfmDI68KFJz)c4?#{VO zbmuPl+UGX&a~s{x))J>*TjX)J8lA=ll{02A2l!-(H3#@sv)LT5L7^p@1KHGIGzVOS zapph)VZ1r8kkDif@V(|<=72`B#T-~im|zZU2569FtL%>DtzyfLqE~Iu~HMRFO1T>(2KG0{J6U^arVI$6CLbr{HI1gUb zHZnMpw_P2)nq#kEFOCx@J6k6C+f>&|u9qi2! z_IJ|mI_x>Zo+Ioz!k#1SIl`VJ>^XkVb>x7?m{l`;aR=qYxY^n2beP(w^=WF?*xoi5 z+t+4ng3fN&w5zZ7-xwouNBx?d&epq`XNce9$#Qx;KFv^C&$NQ04mKh#09hW#Q{OnJ zQ=|UQ5hFBhJ0bc_a00#`fXAPVOtOio7e&)%V z9h{9k1leqSUN)Ay&v6@LbAofYZVq%;NL^vDkn0Me8!UBmgLAoVE_7E)-MrvDuA2v4 zmek!Cypii}gzhS-D+(5IT@iFcq;7t2KG)5M?pspFW1Q-2Es(Ni*it_S^6U=zr}c%9 z3p?Z|>gPh9*CGGBejena4*7rTZ-jhPhy0uRBFGCmo|OtPODYZ($f5K>+qRE-^2>CoCO8G0FIpnvwc=}v@MMtYa7J=+IAUxy)AzN^a~bf z+V{WjCx3L4ioI6)N=($Wq$J^M3i&F`auyc)0yuUS&ht6v75SV+H~FTJ9j@;nd!-Fi zp3P{$^*QbtP0dB;(4m?(Y?xn+sZUm*Pd}gQPUqcDT*nsHcONsxFbgy9v-~G=VHi;F zAYEJx>a)kzy44f>mqkKY0frXXu+`|cCZd@soMv?UI1-R5AQ zx8a^yKWgIc_MqYBdSosbG%O^%BWPGuAG`w}DqGziOims`*HADum7_m7Im@4#nibdJ zynCYGxYlWOwx&DNH7z5Q+*I;hf8l24zj5iKr$pOaGxat0AOy={B z#?utHO&b>+$LFAP+;q*)%vmh8;J9UAyhhk7ErDagAi`P4Z7de^^mdJGNxnkK$OFlP zm24F9l}g5(p!O^!6JHAQRZ1rE667IDjuY~?AUmexd}xbfahjHMyLL^#ewx<5zu%ZH z#?4X}93RSBG}aGj0zw{^D&%4PfL7?^f$6|RArG?(d02XjHdbCQnt|P+8MJ!MU;z4o zCSViL3~T|$16KeOfUAIsz}3KH;96iRa2>D@uoc)3xE?qFct6kz+yEQ|d=Qup+yu-7 z{s?FTZUznpZUJTi9{~;l{uDSA_!w|F@Co1u;C}<{z@G!NflmP)!2bc}0)Gu01N;pz z5BLmlEbwP&W~#(A;ITQNXS~AvKmXiRd|1K#DH7BOW<}*;Ld=+-OR+>_qqfg)C3+52t2CU zDh^}%q-a-gOw|T+?5$nNu}_@e)IKS`sl6K42c2&n`{)2`FeVThnkOam`Cxm1&lx^n zF=v>5zLJapslRFEq+~+#8nTn~m)iC7mt@S}^>m(}7!wE$TjwP6`IaTl!{;^i)6Z*? 
zF|Vn=>G3(qgytt*je5=J^KnNyS(Q`!tvEu?)+8fb09MEJ-H)fHft_o2VaR zA?&lYmd7$I_W_OMevjAk_5n{qudW-qW^p|yS>v_OGCc_c3u1x}V`;tSdg{PBuOZV9 zZ1pB&t^jWE4$fQ!e9${Q^D|p#-zMnRd;4Uz0Ppt>%3KZHAD;GfJXYnjUvy<4^85Ei2tR?W5UzO2VIvV9?&R1iO-5WC2yO<$7+Q?8uOt0 zkPG-5mM3zNzER{N^C6K7EBw*e2R)53!{0Ec$VK|IA{Uw4YLh=3jIqrAisR1e?Az72 zg9T1LznrrY+D)3N*(C2l)$)2MKwLI_ri6o=Cnc$RMRL~4NlAo5){tJ*T`h}WAIDif zpYMGO$sV6I;oz-vl2n}|IqUH`NrXe5oRbs~b|=^rZr85}`fQw+(9vAN&r5ZYc;aK?4Oo4ms_XkMwllc6j0_Q{w6oaY^sF&((T>&Tb^yxBW8 zgYq*J^MG*peaMrn=OmBVDDr22Xi}0kl#k;B%EwU52g2b`BVV%alRV;CkuUo;9fbEC-9_=lh$#{GXvyKQ!NPA0gTKtjN#!ZOD)4 z|2fS&tDJXT<^}Uk_3bJzeBPms(Y(7xyQbNw8JqRGvD((o>gjW`p6;N!i0e~dPru~;zYgO!?UMC$583_e^}u@cXK%=QdJ=Vj?%(=)O6Sqn zQ<7CZJuU0$S=gz1O6|IOO6!IVI7ceRLfB^qY6z_jR6X4#>**d@PZ7%|>UR<~1hI6~ z)6=q^o`qi3Q>xvno(}1(kzLl)p`CrZsw*+b< zLtjtN%X)f2)>HVSTnsvdT+lpM_4J6Wr^jSHg+I#0pmP|rzMh_!_4ESj>F~}P+2y!L zboT9P++V4l;xnb|fA9{}1G)YqIct}!r+Y{**RgUvqSh5u2Wj*CA97vT#@C*r=ge+gu~B;)zkB` zo?bw{ME`@7k39G%9D4|P($~`?vYsA8o<#q{l$SjCCmee&te&2i_4ESrBl;hud8cY* zmwCawQ+>P2i@tV7TTf$HQ~Qk<{O!-NN7|laTidjlKeWxBn9!c(T;Dv0Pj=rnkzLF_$H^FcAB@?T1Iz{H0!IT!10DIW%ZFV)?D7}aYhGq) zC^K06Bv>8h;4<8ZKaHDm5lAKFNy8j{l0##R(Mk_ zE6e&=R+aa&+_hwYWyL*^?}dCHOdJR{S;emtwaR?C{Q zL6)`U>6W{fWLj2lg#1Iu4?+HMrsbRP{EKR|d7w!P8uIG5eWzt~&@h%`y79n1%~)$9 z^vBhjGJs8%TRFC*+{*PU`ik~d14aAl%S8Ly%WpN|Sy@&v4$q`$eMHYYuYK^aBV9b3 zr06Uo{Z+X=#>>%PzBIWMo(}F2CH; zi1yW&VYdkFYcGd=z7uhcuW!aPhh{tv{V_hXCXdfX&pzq%!?VFG2Y(JKKWlu)cgSa} zY}TN|4h;`3<^vmmbAf)~_koR_^MIatw8&?zdVg#d|C}^Upr6g*;x3-AwV!B7l1xi!mk;8e)wpysq0Cwe6(uZA>6E^zREj zt`Dn8+X>l({)Fb(D|!EKXEkcONss=7=KeV;dx7!jN0<;hiT4{Zv{5@qdh{bq=szdr zFfbAQ2$N$=c)yV&I_vRKYDa&<lO%Mdrkn%c{lLf9us#+)r;PEG5ZLNWKr=$k?@ z_Um;e#{o$))?5*5zqFhbigiGSh}GI_5|4RMl8iM^#A;2OlR~i$$`G-p_bTBrXC}#5 zosAmBmY!CdLYSFRoAQ~nb8Q`P{qlgL!DxM+u-RzcNqCRZ`ZD2tMl0_-OFRqMOS+#J zt@{bL8m)&3e`&P-neb`!9pCA9JLy&$tuGR;Fy$#JNA2_Kc=&LMQ8WdZKD0JVRo;#3;pXlyH}+hb3b_46u`Z#5%@gz z%gTYc)a3bH5@0{ggntETth0#`iC z!ZDW{AWsnT0NFl!qE1Hp>`6kl%J#y^IvMSS`9dBf+viQu$!MQ9RmkbGy=azr$ 
zt1*SpktAcrcQA@IJMG>SLPy5EDHLOFuZKB~Ns_VRI~T>6oA#>|!Z8^l*1TRX@R-LY z$yo6niek)5JCH&+Hbcbf?DZCpd3=(L72laCHfP#LDTL!QKH_Tx^_=85-G`quTHhqx zVYHqi+-J1DOL$1E5v=lF{sHOUFk1gkc+zP7l<-6JAv`DU`^xVL((N)@-zMB+w4Nb6 zVzjofD-3guDyN5*kyJ2>J`;*ZBO=ovfXLsg^ zXg?EXclIwre^zJrw$ARoW1{`NFuV6(7y9>gcJJuyK6pd4e-LK(;afugq0a7<&hGCg zMf=~w>^^!&=s(igoz~fXd|I@B9Ascc(+@rs# zYXo|>eDqTNaUv*xk+=~0Jv-t=P!1w-A@qB8#EGC>MB+l|_w0xhLAm%QaUuBp z1ZS%NaS&^;P6Xv3F#;DNq@IEkLAgkez=a5@r{F|TF4Dh2T!@f*3Qh!F z|HOq*^^}EiAgFx^I1+_G;zESfQ*a_kp9L<2OW18!Tujmr0XKrKf8s)f)KhRGNKae{ zP1x;J90<~n0XKr`0C6Gs+>)F!;zW=>7hH&du-mOT5TqXiZp8mJ7b2vdf)hdc838VY z;y{pW2PcBYPh5zQdJ0Yi=|_MIp*RpE+rf#TaT6CJq@IEkL3$^+5Q+mq@_29}XuQOQ z2&t#wM3CMIE`;JhkUSooh#oG4*iY}_LiBJU{uPgi6CAx7oBXJ@q#vU$2C%>bI3nA7BJzR)sdwaMLVvW$lg^+869xlYc#TudQ6UBvC z-R^1dX!M(5z0Wp*7eW7@MMFEb%J}Q>Wdbcywo0U}HF|mQ{|3=3K>%x!c1S zJGSijYqFH3*a6eaEAg)%jkV$5M6ykvKH28Tw&OqB(5ow|D$Bez#ntXQAN|Wl#kEz% zbw2zeVNLbY+Ts#-WpVAY>guYRI`k-6;w~$%EJM2&KL)vISy^#;WocD$X;n?JyP~4F zsloqG|`2WxR8kX5)Bc`J&u zvqw+rILx_59*)X!Rbn=Vs_s?7~v~&0Z={jbF8?^D5bg QgGlz9rG0#uL72$?7n+lW5C8xG literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv index 401f30905c..bbc4374b75 100644 --- a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv +++ b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_silu.csv @@ -1,5 +1,9 @@ knl_name,co_name,atm,vskip,smf,tg_num_perCU,ps,subGU_m,subGU_n +_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x128E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x128.co,0,1,0,1,0,32,128 _ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256.co,0,1,0,1,0,32,256 +_ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x128E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x128.co,0,0,0,1,1,32,128 _ZN5aiter52fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_ps_32x256.co,0,0,0,1,1,32,256 
+_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x128E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x128.co,0,1,0,1,1,32,128 _ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256.co,0,1,0,1,1,32,256 +_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x128E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x128.co,0,0,0,1,0,32,128 _ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256.co,0,0,0,1,0,32,256 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x128.co b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..910e3553fdda70ca7a8d07d606868d7be00d57e2 GIT binary patch literal 28816 zcmeHQ3wTu3wceSTOdgXL5O9J?95GU$FosFMm<}{Nf~W|A1Pp%EFk~i~NJw%sL9y2x zPbLom0TEHuVrx-qwb#}@wAz+1Dk#{XAYh?|iVsvQC|0Rr#kv35kL1iSHkzT|wcnTV z8~(G_-fNw;*X*;-p0j@^w-nEqW{8O?GBNQovLh@;9upMFU`8zI3Rpb;O<_sQ z3^myniRQB~Ae||Um;Fes&M8$PP?RM^g)B|TWQPzK%N8=ThoW|skQM6fC|0dTwyy}q z1|c$yI?+p>&w#_PVyEoY_~smKa?|*}>iJa8XTwf~Kw*5<7oCgelLDq!7ZzBP%w^2~bJr5L*XOb?_B7UcYJGRC+H7|e~86uG-D^I@c1udk}|d79W} zyRYW^p7@{HZ*bKv^;9gbsdGOjx>hV&T2XDhqy4p*?JAp z8^P#Yhd8x6#D)lG>ovq>5sc1ph!2)t-5q0>7U6uo2HCYdg4H>XvaCeS$F8*z&ewaC z_e8Kd$5C!7y{0?J2O^xV_aJ{7!Rj0b`Cw^rPbwdeaK7H7d@6#~mqw}Xqwbm|-N^|E z2PSpTq`Pmg4s{5??&t3cpn@*Q2JRK6nU8!pG4%_%vDvpZCGRrgEQ3!LAtA z2M78b?20i(?IAvD4@rITaQR$a9@6^aL021h#iU2=!5X!P%szP7G*y=dTOS;RPKTVR zJ&cIj!{|PE*fdL*hTJ|l2u(wN)E)|=_TcP;hle~Nmqk$@T!bcKYSbR4N9|!oUp#p9 z*F{NRJcN{ovZy_jN9|!jA3SWvgqW?nG#2*3N9c@jNA1BIwTIe1czDP&RiB84KDY=? 
zL{rormPPI1=014X?7_EJ-IUP!;3G5>ZBctz9<_&6eem#5Q+>nYp2gJKKDY=?#QLZ` z+!M8jjeYR&NK<_i9t$^B>!!s6eee^SjfbN4@YAS0{BIvTJlx=`aC_?NduGPteee>R zi{C`;;i;%S{Nd|(c-mgwSiiKP!sn@8;;Cz@sH$1!aaYje|CeZw=}Er7fQ^Oj4zAjo z>N?XCXX(jgS4`k3`x4i(imF;yQ^k$-jfe4qxQ>WmkGcD zw8N$L+oYcQtM^c$TMkVT>6({LyX6NHKYqrDV|uHX+HxY9`mrcd5REO;M`!`;YZ(v;ud2PYr~+`K3{mB|1>!!7}F69B5H#f*&MJzp(U9Exzu1Z z2Z{*e%z+ZZcynMOp~)QJd!GsBfJU;#99T=3XbwC8&>%lfa+)C!02Bdt12y0gU;ua$ z$Oae!G---}X+RBV0|tP(KsL}2D57>?3AF{>~E!AVc2toJxADc zggr;tbA&xd*mL}D(ZRhMV^+=Z#VwQ{;{(ohPKRl0$-uU)8r$7j#P)O=+n}@CHSLm1 z{MW>Yyivb4r*qxy%st%icIP+|vm%91E`CK<2x^GDxkMVNnx)LdChE2_-kmq#EKWQ$5T-Gf=+B_HXyl(mD z&GR6acgw$Oz6SEO-SU&o<&YP2%fD-$5Ba)oxwAPgh}@uVE7wRL#CIr-I9-d22`LAN(49)+G{f0Zf60DBENcNRO>#ZI5FZUR0_r`4)y85#b# zwfIb-Z(}7n&XN*u0LRXfIbJI})S1B!cMfHL?L42o)>%9W`UML#?R($zlRql1u~$f6 zNlBWPoGg4zBVT1X&ayIZ0LRXb%{F>)65u?P10kW?|+%mj6aB3`3e7q>GDTEpE51&FxOI zc!qcJnrcb&vd@lTkwF35i0iSoh<*`HBs ze$#0%n&X1(2|oTHdlLOk$j5Ji22+rAbbja2r@k9xyE>DGt=r;YpLe1*Hjka$wKZtC zt{IsN1`P`dZwVR}H3x6OhssuM4W^_Fr)wyfmd4Sal9J<3OUsFCao#@JZ(QRvI@e`7 zGc_$M%Wp*eGqxDl#I;Q5n9yS0sr&fkS|I1FgV&fJ1@z0W*Ocf!V+x0&TztfWv^B zfH}a2fWv`50geFv6gUd_C~!3Je}Q)3&w;tX$AJ#uuYmc$-vGx0p9B^Fp8^&Fe+P5| z{{Wl_>};mGv&kKwB(Bk_q`ZW9FV&~z_$Zw5@m_y?d`=7RlbF=Ps)pxT61{PLTqB8z zUVma@&a%{1xV5ju`-_7HEDeeTZqo$r2ng(ACf>fgNZ>wA;DLa^Lz=Dj0H#l>b|J?! 
zZ5YP^+C>}(#`#TKuZVBk+JNhW&bO9*w3oFQ6A2CNQ&RYRux;RThR;{b8K$4FBqKoT zZ(1=Wh0wg3?Bx8VcK!S%8S{5No#)5KL_)*n(iA@5a>RM~yrzEoc}+6rHT5?=Qkp_& zehhYMo>RMSp66kn--AAB7z^RR_DVjMk+~bVK7B=H3h9Teu1wiT{SXV`z|B4$%gFo< z8p-L8_;~w}$Dmi&4PCRio|CQd+GpAB#PdpGf(~Owja38lbF36c#mgT z_DbM=o>AGK*?Ri6LATyBFuMbIw`XYfD&R(sBYO?-hn~W0%47T;;&Vv63;Cq$Zo~$S z^hx)MJg3}0h36summ-gePaq$3U5?(MQGSx17I{k9Qp(3_g+Che(7TWe_#0Uua*=tj z$VK-3A{SQpqp=Tt0%L~1kxr3|%%?>zvbXqBJ{yLy%=wDr>pf+qi=zPCmb!vlF+P zG*i1t-h&$C^-zMiZ1_wGhqX^hR`rVHoE1}&35TyHy{Nld4!=H*uwp*n2bGaMK5N2Z zn@f{bog+Etku|7Kjup=5_ZSgbZ$3<=9N{>t0K&a5|1HkGH{M3 zF>5-o+%qieO5nAgQCT#vRNtx4m3ancO#{yJ49%JWT;OqJ%>-WODa@k$jKDk~9Ca7+ zBe0`q}z)Dy^;totO7ep=+qz6JRb{XeEW6u>`W;a$iF z%`@B|NOs;U@-*>&=tpw@pVIgX;GeMY3FL?78}1_{JD(Q$nYab{5&b`>d1saLuGhR^ z-l@L5<%Q2X)G?ZOmuZ)^8#QCQURS6Uwuj#<)xGXox!z+jCBH41ZQ43FCaj)5BkSpQ zs*AWj_4V{i?*D5se$!4_Pj`{szg`clM}PLZtfxm&2k8E-ucve#eLW>v)zf3Lo}Pf6 zs;AVhtEaSXxCiG*!&nFhZbuEFwSlUqJ7qoHCF?0-*+~74qJ|)r?s|Gm*3%Qvt9nYc zTh-IyJvFk|dOD(~Z*O&l@)^HJ*31kO{&&YZT zf0T=%`;iNp=c=9_l=bwmtf%ltxfps1W7gNx)3TnPK|LMSQzLsF_voI!y^Z@T)l+E$|Bu1D0mg6e<`pEcpIqha;*n5?HKN|U?SFLE71^&l7L zC3I{@J&@~6lJj@Udb+DJS*>@}x`paOF3v~jI2u+@kI8y^!k4VpM{1q(8RnI$k-gRp z%q!Kmx4J=j8i9F0IBGxgq_3w3Wj#F%y;zU!raX|ZkDy*KKmi6=u@+JE3qkI&= zKVjj1v3s;+X3GP%0P}%<;5c9_&@su~oHwbeIe(J3dEBJyo5}9mCbEm!<2V|F?};&c z@__lkeBe0XIH02#cEzwOhF$UX&6t$tzQz)n50&1rhM-3b>nLD#9P{{6D=JzNtRm|r&!unK)wy~YRI>zSh{nS zCSqGIV_P9(TPb6^?Mq_&cKX-P)pAdoWkvNs%gUN`%WaE?SeD-b`A*1pLEbP#!bBWdYkPH*)Mqy^-se4-)Mw&lByd&KK=# zF1XQzXJ|RWI6SMObrU@Yz2d$Dj!f~)lD@Ho6iXO ze}a&U3!1Zv3!8&@HpW=P6vt%+iH`3SCT7OOSTc=9OJ;1WB{MG0qOnb{XhtK`iZ;FC zk2A8Ygak{L#bU`yPPSyFrdnFi-exhf7PNPy8d>Im0hY``gDjbY2U{}FI}h`1(<{pd z8JQpLE6;;{MusKp{PQhYSy`5>3ofv_l7>o7?d$q8-mke}vDh z&F!_(Gg12d@XRpB!Jm)H&l=zT9rBqgn?3Y^L&F1&`M?(7T%aHLJz#6kJfLSG9r78h z-XELIKPOES=x1}e+@=ZaVDq`WTobsGeUHnlG=XahWL=vjpDWUHe0t8u>k{T)0QW!^ zV@xD8#2Qn1UAMJr+ekmam`G?CJP3MRA6ApL1F{MI3C*z=@&4b=Y1O(&kN$+_!Fj2> zf$``^m>4^S_ZvN;Roh2;^dn3hT$*|Sn1p_WDY2Ej-?~jEXU?8AccAE(dmSxC>$8OI 
zM(Ym3JB-#B3GXsmdEeRM8NqJS{n%)Ig>bXcdVuhkM(dvmpFrP5ZiUhMJmG4i zwTtjxqxB`i`(gffcRkBU|4ggUKLdT9 z)Cbs!J$as2c>A;MqW#%0yXS5d`sZ|Z&*<#7tq|?o!tA!+CiL5NcF*eUo?k86pAWNp z;dY^aL1*`z&ThvYqJ2l0-Of9Oey7fEo6hdVyF~koVRnDqAoPFK*=^U^b=@o4yTa^t z{Xpn<>Fl1@*}ZhXXn!fp?&Ti|{mVML7j$;Je=OQ}huQ6UQ0VvQ>~`qvUfC?#UkS6@ z`>@dO)!FUT+3kBowC@YE+y676->UC{dhUJ9 zReJ!BU5Z#E$4NFJPqqnpN|BJK774j{p^&F76!PV)mD^2c>p5Ow6Y`ZdAzxJ_WM{r-W;T}ymEVh;3qv^ZfTcpRmnQ++PJE0eA z=yxIKpg-a8*avxk`p$Qp^yp7GeDJSRPXb4vAK|Fj=XgK*4)-zX(T{M{;JvAz0!O1C zp*{9Z-cSFX?Q?2Je?t4*U9`UuX7}a`LjR`D z?sc8rTRTPjTVZxb|0wiFb#`y)?B3ob+TRYdd*@}Le@AEcrq1r&J)-^HFuP-Wh5ne% z?k%0&d;3NEdtr8e`;*ZBO=ovhXLtOdXg?licj7NXe?n*Xw$AST!=nBDFuM<46Z#Ky zcJJuyPQEVMPlnlj_@>Z*sIz-lXZQD`qW$k-b|1YX^dIT$j_K?^J|@~f4zoM;H=#eJ zvwKfx_m2~z{U2d=pL`(npXluVrnCF>L(%?enBD1*g#NV7?zqnGvs0q|voO1Vej@b$ z)Y+ZT*?oRmw0|CEcjli$e@18bzRvE8Got;AFuPz-=z}`D52PJmBOH-ygg4|G;VrpF zcw4R!-j!>F_v9MkxLhN=-@is+xJQ3e*9i2?`NTi{;6!NrJuTjU7Ds}fCxqi7T5k5|oQ?5=Vm1PjI22D zb!a3LSAvi@65-qk(i3N*97r4qK9?jng18c-CyqooH-hxUnOHz}@_dRLL0k#a6GtMP z8$o*FO#Hh!68gP7;!03{h$Er65hN2=g2qoA2|mBTi6fb~5`@H&P}~TTi7P?lCXR%D zFORqqq$iGq;zp26TnQR4aU}SBliUd6N|2s75{er^GI1sTQ#lf6)zg2$jWB>qu@>Vc zjzo7orT+SQN;0@+)XxkK#d;udB+ja*)K6beNmlg~Tne%?fg{mfPpMs3Pl+2reazrb z+yf+zL`XdaSAz8E5jYYd^%Pu*jnwb|nIjQWPr;R-{3S);NQBf=a3v@QNf9^_A@vko z3Ccxg1dc>VJq1^Sa*_EB;z)$lQ*b5d`X`Qrs;4ZB8$s>E!JQ}r5=SDWo`Neu`W$d1 ziiF)Z#nB}FaBwE*`X`P=NIeBtg7n0Z(1hI%#f>2ScyK1D4iHCz&n?L{Bd!GL^TCk_ z2)i!DjUfGaa3=oU9Ep&63a$j@XEZnxiW@<)9b5?-KXD{N>M6Jqq#q5AgyKe!YzJ3@ z#!VcFka`NP1nHgNNGNUu$rHhqpz#t%BBY*zD?xfEI1-8*LGna!CHgrM=^Oev5|OzZ z{Tzwj`7Qk%iGGemKS$!5&5@ur#wc(j0=VZ9S7IBGI1-8*K{9bAh<8973F=SW2$G2_ z(M5XnC)9H!s2_17c0*5GiG9FG90}^D=SCbrKjKUr0)81sg2xPQ1jS5T35qciM}o%+ zZUn_jTnUOX5=Vl^3T_0&N?ZwwH4;aH#|mx)#YkKUim{&~(Zl=b=ST>yMn6Yl`tE*? 
zgxEjm=Sax?kA9AX;9wAE;v2mG@yzM7ITHMS3vLAQ4u~s3JOkoLC~gGF#FZf40dXYw zeHYval8Gxpe1k|F34Y%NH{vAPi7W9jFcL?C-*t(iKo5@*3+R?wqxs#znoH*VmHjFsl#9ODYW5l{n%#Am}+z6+VP)l#3i-$b=986 ziUwDcm;Pc;g|EJ%$%|jwYiwBJtEhCKiLuwY3%XzKVv%`l^~*kM9y+W922awKpso1I_iG$|f$@a@}rMp37Y|Zv3U= z$9r;JuF70@Zr<2S^C~^=yi3Pj<{67W1XQ`Ga%6YdXB`k_gkeK*8Gc(v|4QOQ99vkf z*D`jV)H^Q?)en>UBa=e)W28QQdZ@lY>dV6XPmy|4WoZAIQa`#TRKJfz1Gu&;LiO_@ zM*5w;yP%KsJAKSvSKs7ecK1zn_~AijudZ8a_qu#uCjTXUQzNrC*1KIzE@p4?ENkMu zT}x^zq4v}^*5e~}yYYcCdu9C+`lUujKVsPAS_JgMS{w;vwXQ{;+KRD`sm1azchY4A z>d=utO&#V>>(-7ha47Ah%f_k0srh5oVeyp7(frs85QOEo~5sk)$0O-HH=bgp>3>80vLDvtjV4Av`qRTmWI z@ZLTCmAzE63>=VeWv}XpLYwTU+Ex4tFF-rd2~@kPTMAX(l5Y4I)G^_=@TPGod-c1N z3X9ODe--Hk9AAPr*{izR{d@FrUt1HIJssmbe?eqWAWsxGVi z;U~HHYa;$7ow7eEukVx6KCL@2(Vt?~zpjQrx?H^y$IAW}VR~h+a30#bbEoV3!1Y3~ w$&Seo>0YAZSM%pO=v3^&Li`(8EKrSKwW;$e*^7fn_PMp9$6?h`Vz~YP0VhG!-v9sr literal 0 HcmV?d00001 diff --git a/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x128.co b/hsa/gfx942/fmoe/silu/fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x128.co new file mode 100644 index 0000000000000000000000000000000000000000..31cf559926dd7f2d42ddeb3998df3535ecfc7907 GIT binary patch literal 29272 zcmeHQ4R}-4`9Dq4^dtQO3Z}KK5i2VUVo3|uF0dk>I)p+C79VB_P1+Pn+ciaTe^+nQ z4*>zOqKL!Iq0EoX&3(AZHrlA5;6MwAD;rEyP;ubLlnE;4|9j6lH@%5*)rLKD&(rvX z-+kY6-uJ%e&AIPAx#wqk%hZ|EbWu@71}0v5c9ccQV}c@i+*2zKW=4~)fW_k9B$mL8 zP?K$uX#O1fq%##`Wj|7@=ai}-D9RL~iY!IQWQPzK%MvoP24l}mA*-mhqgZ>E$oAF2 zST97D!<|Er=hNZv%h;**>iFg!Z*$Bo$sr@=v+LX6fnI)uscntRaA2y z1NEQKW&uvRX3F&9d5n2~;aKK$dmPrKu7+AyjpvRvTdj_UD$ik7*8U5tqoQ%Sqo&+Z zQQ`7<9&KsaRc$?S)$b(NJKS0meM z^;Cb~75g*m4UU@SuJWbTwa&*x*Yd^7D=S?MZNlV%dWTariVb7*@Zy`wxeL`(<4yIh z2deXIp&>rPL#(TwU5(HZ=45?Fm=nh6d`CD>jW9RN z$@+}2AdJ!Zj&O<^VNsZq^%>!eFh=J&!YyTU)DVlqoUP9g%fc9)?+~X4LR=K)Y<-64 z4r6q_L!1%_u|CY%`V4VJ7^CwX;sYhu1Y+EyggIZILGD=@#_GIBSz4^l$35%AoUiXF z?+Igdo}=7Sa%~{U`@@{B?;sxzV|AW`e4u1%Zz>-TbH2W#d@_vH7e=YxN1fHn0?F|S z2L|KU@T5;aH>&-ip-0$$mI^&{f~i7)Zi<{qPW!gbyQia3)d*pZ3GSmNJi;g1u3! 
z9}cuR*c)Yt*h6f@9uoTFVfs8x9#Z<_K~o#|Mx{pV!5p!NjDC37GDVXHOFtY0PlxP? zJ>*2}VRSz{Y?-Y|LvBAD1g9ZCVh;rod$9Mz!-KA%%c7_sE`k#=C1MXVBK9z|KOS7# z>!P?n9)ik4X~Z7NBKEMbA0D=1Ld?-z8jJejBX~wQBlh5q*h5V}JUr-{qD@47KU@SS zqA_9*DZ8qW=bgi@DZGe)`&f_EIX#)|jq*KdUFKL(URmR4EWfd?VX1V^+nCriI5e^Rx>@5K*o|lyms`24&Q-p+ za_so>#Wi&mOFb2i8rQV?g7T`d%g2`AfNhHEn&stV8>`CkS>@SQjI|X^{iC(Mq51|# zqbm?nZ_Mns)&|%2u-OrG>O1iTDk|z{R!>!ZLwQBr^4iAwhpe8PmMyNUIW#G zcuo`WyI044X}!^LgR8Qk4#}jP6sa3EH@F%+)pfNU>MoMl>(Ty8Era#g?NRs3WMdI< z5bbcO`)yKB{nd9mj#DhOy`*qmdh|s@et+h+aeC=;L!G+UVn3%|@fo0)>)b^yN~6-smeL zj4}F(31f}EMT7>UkMA|d8GQ=LCZlgXVZ70IKR|)}ILRqGpAS$3><0RP?*Z8$osTZ! zB47$o0a}1QU@nji*7=I49av25z(v#!tf%%NI-dg6&IkI8^Za>yE^NYi4CuBg0q4Ps z+Q#_D@V3kRmvM~q$8o%JioN5?)(IVUc6VntyBF_uct48wkMQ1t_fPTe==8IVopeS& z`=P|!CH_p}k0m}J@ga$KavbU(%FmbPPve;APvkh#KayjnKa=Cd{);(Y>c5oZApana zu)mdd*I~~Q_8ei)5%wHm&k^<^4JJ>EPBbg>`op zv3;HTR_Lr&MY-%U@3m1Pchs-dZtu9AIY)S%&TPBW=~i@=O-w01Y-6Kie30dFEcK1C z+ZF0>A3a)8#*Fdaohtmi3{1giF>EM4Yv6ofF7R$(5%6VTF>u&0eAdAEzSfOC zx&FDxgP+aC=Vf8}`)oHcR^l(=x)SIvmby}ZDc6-kH(cuG`R8%nJm@Zwy7~V3TsI%O zY^l4}e=XNt3*DtsSLQF{x-#fSNZkVe0^8-3 z%TttW##))v;&oPLi_gz(X?MG`^Q$V|E%-eBPIp2qZk!~0D-GE?*el4neX6}9dd7r~ z3HU7SX0xKCrFmo4<1>Z6l@({(i;LYp9NUZMy3OoxXBs=wna=*+c_DkHbLu4M7cNwk z?|#=y{^%wZeUQB`FD-TZaBMG~@3zk`bKA?Vb5A2XT;G27LMNs? 
zo7If#bHbB~Iv4FZIf^oJq*sh-aJFx7s=Me``|WmI$0jypAJa!M6Ep5J{U36n8`fka zT}%|}v(wV*bS9Ww!`zdo{{(wSBJyX#Tv|!@lxD;MUG&uy?*wIn#iA$|U+m?5;hVR& z+B=N$97deOh;xMcS9H!MBip>+=+ZHc@E`ph_)bYtltF{MezvVsjPDPf#PRV?nBC!L zf9yngVTJ6ks5QUs)ai{ee)a?(zn}dM{SC;+?}0jlpS5>>>(M8_?Pq&C6NYzex3Nz< zanEcTd*z-kzwY`bWX|u`Eh4@<_9gM`pAuh4Ok)bnF&Ir!*-#9ncFj z09%1ZU^_4txDpr-Tn$VBt^pw$xT9l%uJ2H-H@-9R(&9$-4~USI}rGcXJI zBcKI%KX5p33osk_AaDfmr@$QG!@yC%M}eb(KL=WYzX0X}9|ziizXs+5e+wKB{2j0W z_$06p_y?dJ_$S~*U}qE6oh{DT1aXa4CgjD%x~V>`#7AL|jdgotW3!uipZJ7kRyiWi z6z`7l;u?vMcYEXGvsWaq!IEz^-k%-P;nrUyaEBsrmrvjxX5j7JMFL+^1orp@9#$+h zJ(xbp%EcU0l;Io)DVJ~@9OE@~T@~BfRgdd~&bOX@aDX-I;|X{eRL}*+~c5?nwyLSGPjQP8P&hry}JfUuDNfMuL+2TBWUQ<8q zye1j*n)(|aDM=zUJ_b8=o>RMKp66kn--A9W7z^RxwhBI$k+~bWK6O<^66uGntw`ET z{SXV`;H@4W%gFqV3dyODczFA;$DmiQ8@gt3JtvxDm5;NW@fQ?F`EB~jCZ*`{1M6M7 zEHALb6`!>dc#ms%)@tCru2ESZTYCGpLbt&+IIA6aw<|qs4REu|mbDJ}BUfP-Jk*B2XC48)A_@gnW z--TSj-^c=yi;PVo7g_g-T$tgH#-9EJ#teTW?IIT$+e9w1wtJF39*(ih{gUI(?(N&x zxc$X;KELd9;=2rrq0J!gLG|)_C`Mcse5QoM+a@Qf^@`-|Rg)75N312isJluwzdnw# zseHZ3;&pO5M&$@!0X5(&pY=1D9Pb|={kZa0kPm08ZK zBFu?mmoD>4;9OUH<_ut&Yk20>!0TM2GHG6^eWyTI>KdFm4LIMGo;ee^&}GY<1-#x> zm`VA`!8{-wbrp97zH;MdN@0*;c|b41o9>8KFOoEiF{eNBVVHb zhm?l`_$Ms93;CdVhWi7__Dv#B6Yqn5B=>)Y#$N#cgoRHaKQ!NPA0gSkP2^|dcH~F& z|CHvPS#mdQJr-5``{Fr< zu6a=*_4FxOPj^yX#PzAIr(baYUytz{cFTIYm+b!idSCJ{r>*D?p%5%sxsH|V5p`Wbb-;qpnsE5>ka~JT*3(lZ ziGlTtT!&CS$i;aHZ97p9**2b#d@rp@|1)5Ksf4jNIg9( z>*+bzMm~LH~WNKXn z`8LRFA>W>43gjw9#I{n#wo1mfTE=$U7sU3>)UTeam97-is;a@J)zzt{+m;S9t-J&B zosjQ>ym6Swm085IM#i&N#GbZD!$Fs&|Hm$8nH?6DAFx|d1%d}<_kGhiJpUAb#IR?Lp-yj?~?T0w*}8k*;OZdY>X{Z z1TJP*bJ?i~tYqKevRe_jgw5n~jUup?UBl&iMPLJ)#pOmt;0u+0=~oR+FAJqrE*@&oTxLGGz=IV#*jg z)Rb|-1(f@c8>fX%>pKrisS zz?R;5K+i(jB%UNXfx zEK|g4j+@M5PEVAv7Km8QX(h=NYkH=LH6yNq$DEZYW3{&^6kA4`Cz&uS)06zMy?4#+ zEBe&|TeIH$G+~?Gyo>M-z4>{caC^Q&`r9Z=*{~Hx9ZJ3gul|8|4R4-`cCZi zyMuJA^yX&?*XqrC2si1?FA&}*`%1qrk?s+_`DMb#^yb5a+w|tY6K>a=wd;@qrJ!w= zGOKO2GP^Bw?xg9RO=)<(q{B~NW9TdHTsEWI*Q^Isz`xH|Vbr01?z1|c 
zW~6_rMd+V`K2Pd>?9{$I*M8pqbem{@I>hdoTZR4^jonikyB(`U`;HL1owo`7PL18u z8oOuLiuPwi?4G+_=%3TrJ)^PPb%$u*6=JvhPNCnevD=}sd;Tub{(OktpEnBqpEY(n zHFkS8iS|7qc6)y)^m{dS&uZ*mxKFgd5MuY@kA?n4joot^yY8Qe_U;h7eGdrzK8@Wj zjotpOqJ4je-GPUM{(#1Ax5n`q z*4Xv@O0@Tc*d6+f&>zy+?a|o%^$F4b*ATnE{a)z*rm@?ru{*p?v>y(!JMu@NKccaF zL1Xv#?V|nfA$G5H3H>V?yBDP$bKKiw@Znz80(=_#Xw|?Sz}tY&Lf-(~1>6999{TSC z_W(BoU+BH}F-J`g9=jB=29A>~LcY=>C;_sf&a>ZIO_tvlebQgKgk=l|{%` zTZH_bA|cN#67n^Rggk4JkZ04{Csj%5V#EM@+UHNx$Y`HGUCOPZy=;a?Mtj*+Qf?FN*IliV(SF@`q}(pr7tYkk zXkU1Zly4R7*U!?(Xup28kiQiht0-}CUiJB0yskB{MmUQ5ng#c0!r{>^{2oo;{f?0y z_h!Q3L+^xMtfAk6oQ?j3BcdPR{pma3Nz$V~;fSHXNq!HQgMNggqMzaY=sVnpq(?u( zQ9}%5=#JKLw!j{bz!p&#%y0>zpe*OE+VOO!F=I~c{9n|5b1p)K>y zWQs9A?g5VD6J@OU&P6fir~M|GaD1kSwIJ>p9&=%$j1}LZD8_=c1IdJinIcwu-0M8% ziHR~+d}pHA>}ek)6Hd(hfUgnM=O;($KKzv4{2Jj-z4O5z+om zh~2xd2>rVnyEip<@4YJ8-wUyO|8=2%Ut{-{#_pfTMf*QP>^^u?=s(cdozU2QctW&) z7-Dz&A3}dxWB0bk?q8=w`@cf$K6+Q^KhoI!Lt}U5ebIg<#O~||LVs3ccT!{b@oCZi zafsa~9|`>@8oN^(yHC%G_D@6X&V3^E=QMWjXzV^aC)z&?vGe$E`8*+{CmRuveE!PMq)wI1=y5u_*1#6q%@=To^6 z#FZdDaU?>y5u_*1#DAJ2q20?Pt_0I0f{-{8DmQ{; z;!4oCi6f!i%OkD?>4_tuawAA4t^|#jI1+rmNp1vjB}h*k36&c`GI1sTS2+@&tEc~l z8=(W2Vm-!79Em_ZrT*G_N;0@+)XxYG#Red8BtBP9sh_rn0FFeUo>IG} zo)R~L`WV5VxCclaiJ*E4t_0~*!*C>m>M6Jqo2lP_nIjQYPr;R-{3V3pNCeeWa3v@Q z31K)ALG=_|3CcxA7>-0xJq1^Sa*^>h;z$J5Q*b5d`X`QrT2EOBH-g$nfICqNB#uN- zJq1^S^x5D@6bZW>Do2y_Bfy!U>z_ChLG=_|3DOfsLJ@YmRBi<6$AdFLb$~b$d~Qju z8F3{@pAU|NPuT5Ixe=rv56;AYnj;ZZPr;R-{EP-iLghw~Yz0?>#!noHpn3|f1nEbE zBcXC5NVbA2LE|QlL{L2iSAz6*a3oZ21j!S@m7wtwMIg{8$q!WSAt><$C2Q%f*V0G5?6v^9N%2IJ;0F=YlH!ggj^#Ga3uaG)(B62q;e$IbUB-y3jM}d^r=?xCFr*# zn!B)t#$VkR3N%UCERnX{=(Q04&k()h1u(C~)1Ue4=+G?Nv6aVPb}36S05hv=@z;_H zE%+NsmYFlBSZujg{AbCztfsEE%GFR_?`U+>UqUMP)Ri~7@e76x^~*fv6^`0+&+_{E zx`syds95T#s;;dmc_2%~doR#CoUp{`kE7#$u$aUuCjlDdt!sX1neB2eTvG@Z^6^koI z2EzW_0Z~X8HW=$K3vNn~`eW8${RL9r{rzD5aH)TIL9l*|)VCG{>xCG@awZ4sg&4z% zuL{=Bl72Ra_!nXf^M%CYl>KLlgMc&sA!EFaO9Co>@Cp5^i)B5lgZ1P#-0vuEgFf8v zC^BnpU89Rxoj29uS0S0Ts&={6?eMsn{Fn5N4b0k5=X5kWn6=ThqLKG@EUT`7+Evp~ 
zhpFLoVq!3BMcp#`B~3=ZFxlu>40OX<90_DKj>WE;^4#2UlLLo&SIEP$d3pJ&cGA>I z>Y;7YSoLsRen2~Zil*=7y&iJazEiHqQxB(&of6@%HAZMoKlWHYWx=dT*IX4cfr3B2 zFD=n^K-=s()j~axsQw)_wOOqXs!X<2b5t|64yiapcBDE)=ZclGTAz|3|I6>MQ|;9{ zq++)0uePi9Qq9tEK)%&>wQi|sksZ}`HGUN@LOanJ)ONK_s_2#WDu}wFN{oF2hon>O z)!!FYv8cBzKMIwp{E)p`#{<7xn#H@bz~A>1qUfw11IOV_wloHMsn@ak_xR}d?dYZ6 z2h{xGC&Bn@BK{?vYCq#Pp`Wo-;FD^igveu+ha9fKzZ8G2R*7TP{+A(oHGeA3M|&W5 zn!ab``Q5T(BHHM!#;?ww>!DL)*O;?K%Y>wkUu{#*tIBR1gtPCK_J$CH5Rv^qYExTnO{Ri^49f;Gp8oE+@NCRM7W3KP%0RUdm4R zc;4rEzc=SQPacxg?ooSpDSIh;X`H60kY&Suf{tC@IQ@BPdje^fiO8jM3cw1hkrAj({lz#dSbS}(KkB?PNCL%YBY694ft<6LuItLgHUyMvAw#B&Oibs#rX-K4Mc*D)`~XV+gB-nMsye^ z=@{&GU+lZmQ2DLE({<6WL}^y))DufwA<T%TfWUxKY|FpTeW9`*OBi;q?&t?oZaCUKhY)>6p8lQyT3*hErjrm{JW z_%B{hVIK^eX~dBPMGx56N-y;%B&;D;_EXfzmGAx8}B3mvEuxH`L z^z}>mTt<}nf>4~x30aX>@_cblDDrY4tMEx7$1974LS8Pyj^t$_GoQ;qU*uEb>|%zW z&!zKxIxq8rB=LELFUa}yyd*09ik$41q_?xrfwCYbiyC3rR7wa6sq_$cVwmGZRuGbG ziVX&agGn(J93FaJ45ZUyayEGie@Z=hHD~MEdWW3&@M_b_W9TV-GS|`dE_c`=mCxwb z@vG2{M2~hyjIIkW=Q_fnw~lrvjqW}1sJ=%iZA0eorP-aqKOgU)Fg}@Iiw$rQBnh*k z#IxbSVY3_vM9nf7eW(sa0;U=s95l;uxOWUM$xq`BrOR$gUum1w(DU2Sbh|0G?zuB}w-9HYI}*{yBT6}QP4 z)jOB^MON*6r-V^TMHv%?&E9ZhmL=f8eh>*1WFHj#q|*-@vJWB(^?CE&bofgFGuBZV0vHD>fnz`wFbPxxQJ@Bx1{7chr~_UC8i12P z6Yw(744ej9fHS~Tz^sO!An>Y2dM@6vEnpd@z;6?xKMlSBJ^_9g`ay64JPW=EeGI$^ zo&zs|N%|Hc=aQ!Y_JlavKq0Qyv zjU7JT*vWL+JLFD)b8w;%uXlxbJxpgCOEi^>5tWPGi)eOuPq>%Cd2IHn%&dFKY=(Q7 z?y|=-vj-~U`Uk9!P0sszeY|fU(_LQbz6{Q1lk<9Ch}ZWs-EDuLt~Zu?R9@>>d2N6$ z3_rTkwAbMo+;AodoVC%~`Z{A{aoi(Ccgu5|i*yyI@v+OCHy3G%FA~%J_Rskuk+^=1 z^|8z3eH}jD*U5Chcinvh9AKBp>s=vU4^uUWHR57K`hKR@*vsklAl8W2`c+;V zFw((8+bmF_MCOto{dN{E(lcl=O1uC6qzwHFNufVITZZ77_udSMl?TiJVlJ#au-j7} zi0f?MVSm9b=7U8c2e-|NgcubT1w@2)4;Lz8Oql&yEEuyGi1DDuhJA{diI35okAoPy zy@p%Ng^?J3W}AENxc4gy1^sHIOcAqSdEfH7m;w6xNQovIpLslhYq7j8W?<9%rpGru zFJysvoOz!y^SYP;)BAL2v{VuO|0Ry8#<7fRy!8ct%E0XCcI%$A|T* z2yHl$1TVUi$jLu_YFpE~fA_7ac2_)iP&? 
z^@JvZ3q{w7PJ8IA%wp!T_b?%^C28uY?x=09QEC+Wp0tyuqK^7pa^MyGg)LitW}HhZ zpW7m0SZwMIw05Gn^gvs7^$D}OMh{OE(~=2~)@dr9)7hQL>${pr^zhh=(NU$nq1_tRDP7iZOG9I% z&QjYO_L#%ky5 Date: Fri, 19 Dec 2025 22:38:47 +0800 Subject: [PATCH 16/40] Optimize RoPE in the cases that hdim is small. (#1698) * Introduce new grid config strategy for compatibility with cases that hdim is small. * add launch bound to make sure that occu is always 8 * follow Copilot the suggestions --- csrc/kernels/rope/rope_common.h | 761 +++++++++++++++++--------------- 1 file changed, 393 insertions(+), 368 deletions(-) diff --git a/csrc/kernels/rope/rope_common.h b/csrc/kernels/rope/rope_common.h index a14c0534e9..598f70c207 100644 --- a/csrc/kernels/rope/rope_common.h +++ b/csrc/kernels/rope/rope_common.h @@ -3,6 +3,7 @@ #pragma once +#include "aiter_hip_common.h" #include "dispatch_utils.h" #include @@ -1271,20 +1272,21 @@ template -__global__ void kn_entry_1c_sbhd_uncached(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_i_s, - const int32_t stride_i_b, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_s, - const int32_t stride_o_b, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_uncached(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_i_s, + const int32_t stride_i_b, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_s, + const int32_t stride_o_b, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1316,16 +1318,16 @@ template -__global__ void -kn_entry_1c_sbhd_uncached_inplace(scalar_t* __restrict__ p_inout, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_s, - const int32_t stride_b, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_uncached_inplace(scalar_t* __restrict__ p_inout, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_s, + const int32_t stride_b, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1359,31 +1361,32 @@ template -__global__ void kn_entry_2c_sbhd_uncached(scalar_t* __restrict__ p_output_x, - scalar_t* __restrict__ p_output_y, - const scalar_t* __restrict__ p_input_x, - const scalar_t* __restrict__ p_input_y, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. 
- const int32_t stride_ix_s, - const int32_t stride_ix_b, - const int32_t stride_ix_h, - const int32_t stride_ix_d, - const int32_t stride_iy_s, - const int32_t stride_iy_b, - const int32_t stride_iy_h, - const int32_t stride_iy_d, - const int32_t stride_ox_s, - const int32_t stride_ox_b, - const int32_t stride_ox_h, - const int32_t stride_ox_d, - const int32_t stride_oy_s, - const int32_t stride_oy_b, - const int32_t stride_oy_h, - const int32_t stride_oy_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_uncached(scalar_t* __restrict__ p_output_x, + scalar_t* __restrict__ p_output_y, + const scalar_t* __restrict__ p_input_x, + const scalar_t* __restrict__ p_input_y, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_ix_s, + const int32_t stride_ix_b, + const int32_t stride_ix_h, + const int32_t stride_ix_d, + const int32_t stride_iy_s, + const int32_t stride_iy_b, + const int32_t stride_iy_h, + const int32_t stride_iy_d, + const int32_t stride_ox_s, + const int32_t stride_ox_b, + const int32_t stride_ox_h, + const int32_t stride_ox_d, + const int32_t stride_oy_s, + const int32_t stride_oy_b, + const int32_t stride_oy_h, + const int32_t stride_oy_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1427,22 +1430,22 @@ template -__global__ void -kn_entry_2c_sbhd_uncached_inplace(scalar_t* __restrict__ p_inout_x, - scalar_t* __restrict__ p_inout_y, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. 
- const int32_t stride_x_s, - const int32_t stride_x_b, - const int32_t stride_x_h, - const int32_t stride_x_d, - const int32_t stride_y_s, - const int32_t stride_y_b, - const int32_t stride_y_h, - const int32_t stride_y_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_uncached_inplace(scalar_t* __restrict__ p_inout_x, + scalar_t* __restrict__ p_inout_y, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_x_s, + const int32_t stride_x_b, + const int32_t stride_x_h, + const int32_t stride_x_d, + const int32_t stride_y_s, + const int32_t stride_y_b, + const int32_t stride_y_h, + const int32_t stride_y_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1484,21 +1487,22 @@ template -__global__ void kn_entry_1c_sbhd_cached(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_i_s, - const int32_t stride_i_b, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_s, - const int32_t stride_o_b, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_cached(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_i_s, + const int32_t stride_i_b, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_s, + const int32_t stride_o_b, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1531,17 +1535,17 @@ template -__global__ void -kn_entry_1c_sbhd_cached_inplace(scalar_t* __restrict__ p_inout, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_s, - const int32_t stride_b, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_cached_inplace(scalar_t* __restrict__ p_inout, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_s, + const int32_t stride_b, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1576,32 +1580,33 @@ template -__global__ void kn_entry_2c_sbhd_cached(scalar_t* __restrict__ p_output_x, - scalar_t* __restrict__ p_output_y, - const scalar_t* __restrict__ p_input_x, - const scalar_t* __restrict__ p_input_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. 
- const int32_t stride_ix_s, - const int32_t stride_ix_b, - const int32_t stride_ix_h, - const int32_t stride_ix_d, - const int32_t stride_iy_s, - const int32_t stride_iy_b, - const int32_t stride_iy_h, - const int32_t stride_iy_d, - const int32_t stride_ox_s, - const int32_t stride_ox_b, - const int32_t stride_ox_h, - const int32_t stride_ox_d, - const int32_t stride_oy_s, - const int32_t stride_oy_b, - const int32_t stride_oy_h, - const int32_t stride_oy_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_cached(scalar_t* __restrict__ p_output_x, + scalar_t* __restrict__ p_output_y, + const scalar_t* __restrict__ p_input_x, + const scalar_t* __restrict__ p_input_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_ix_s, + const int32_t stride_ix_b, + const int32_t stride_ix_h, + const int32_t stride_ix_d, + const int32_t stride_iy_s, + const int32_t stride_iy_b, + const int32_t stride_iy_h, + const int32_t stride_iy_d, + const int32_t stride_ox_s, + const int32_t stride_ox_b, + const int32_t stride_ox_h, + const int32_t stride_ox_d, + const int32_t stride_oy_s, + const int32_t stride_oy_b, + const int32_t stride_oy_h, + const int32_t stride_oy_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1646,23 +1651,23 @@ template -__global__ void -kn_entry_2c_sbhd_cached_inplace(scalar_t* __restrict__ p_inout_x, - scalar_t* __restrict__ p_inout_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. 
- const int32_t stride_x_s, - const int32_t stride_x_b, - const int32_t stride_x_h, - const int32_t stride_x_d, - const int32_t stride_y_s, - const int32_t stride_y_b, - const int32_t stride_y_h, - const int32_t stride_y_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_cached_inplace(scalar_t* __restrict__ p_inout_x, + scalar_t* __restrict__ p_inout_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_x_s, + const int32_t stride_x_b, + const int32_t stride_x_h, + const int32_t stride_x_d, + const int32_t stride_y_s, + const int32_t stride_y_b, + const int32_t stride_y_h, + const int32_t stride_y_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1705,24 +1710,24 @@ template -__global__ void -kn_entry_1c_sbhd_cached_indirect(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer, - const int32_t max_position, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_i_s, - const int32_t stride_i_b, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_s, - const int32_t stride_o_b, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_cached_indirect(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer, + const int32_t max_position, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_i_s, + const int32_t stride_i_b, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_s, + const int32_t stride_o_b, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1764,35 +1769,35 @@ template -__global__ void -kn_entry_2c_sbhd_cached_indirect(scalar_t* __restrict__ p_output_x, - scalar_t* __restrict__ p_output_y, - const scalar_t* __restrict__ p_input_x, - const scalar_t* __restrict__ p_input_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer, - const int32_t max_position, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_ix_s, - const int32_t stride_ix_b, - const int32_t stride_ix_h, - const int32_t stride_ix_d, - const int32_t stride_iy_s, - const int32_t stride_iy_b, - const int32_t stride_iy_h, - const int32_t stride_iy_d, - const int32_t stride_ox_s, - const int32_t stride_ox_b, - const int32_t stride_ox_h, - const int32_t stride_ox_d, - const int32_t stride_oy_s, - const int32_t stride_oy_b, - const int32_t stride_oy_h, - const int32_t stride_oy_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_cached_indirect(scalar_t* __restrict__ p_output_x, + scalar_t* __restrict__ p_output_y, + const scalar_t* __restrict__ p_input_x, + const scalar_t* __restrict__ p_input_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer, + const int32_t max_position, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_ix_s, + const int32_t stride_ix_b, + const int32_t stride_ix_h, + const int32_t stride_ix_d, + const int32_t stride_iy_s, + const int32_t stride_iy_b, + const int32_t stride_iy_h, + const int32_t stride_iy_d, + const int32_t stride_ox_s, + const int32_t stride_ox_b, + const int32_t stride_ox_h, + const int32_t stride_ox_d, + const int32_t stride_oy_s, + const int32_t stride_oy_b, + const int32_t stride_oy_h, + const int32_t stride_oy_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1842,19 +1847,19 @@ template -__global__ void -kn_entry_1c_sbhd_cached_indirect_inplace(scalar_t* __restrict__ p_inout, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer, - const int32_t max_position, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_s, - const int32_t stride_b, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ void kn_entry_1c_sbhd_cached_indirect_inplace( + scalar_t* __restrict__ p_inout, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer, + const int32_t max_position, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_s, + const int32_t stride_b, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1893,25 +1898,25 @@ template -__global__ void -kn_entry_2c_sbhd_cached_indirect_inplace(scalar_t* __restrict__ p_inout_x, - scalar_t* __restrict__ p_inout_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer, - const int32_t max_position, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_x_s, - const int32_t stride_x_b, - const int32_t stride_x_h, - const int32_t stride_x_d, - const int32_t stride_y_s, - const int32_t stride_y_b, - const int32_t stride_y_h, - const int32_t stride_y_d) +__launch_bounds__(256, 8) __global__ void kn_entry_2c_sbhd_cached_indirect_inplace( + scalar_t* __restrict__ p_inout_x, + scalar_t* __restrict__ p_inout_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer, + const int32_t max_position, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_x_s, + const int32_t stride_x_b, + const int32_t stride_x_h, + const int32_t stride_x_d, + const int32_t stride_y_s, + const int32_t stride_y_b, + const int32_t stride_y_h, + const int32_t stride_y_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -1960,25 +1965,25 @@ template -__global__ void -kn_entry_1c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer_0, - const int64_t* __restrict__ p_indirect_buffer_1, - const int32_t max_position, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_i_s, - const int32_t stride_i_b, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_s, - const int32_t stride_o_b, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer_0, + const int64_t* __restrict__ p_indirect_buffer_1, + const int32_t max_position, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_i_s, + const int32_t stride_i_b, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_s, + const int32_t stride_o_b, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2020,36 +2025,36 @@ template -__global__ void -kn_entry_2c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output_x, - scalar_t* __restrict__ p_output_y, - const scalar_t* __restrict__ p_input_x, - const scalar_t* __restrict__ p_input_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer_0, - const int64_t* __restrict__ p_indirect_buffer_1, - const int32_t max_position, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_ix_s, - const int32_t stride_ix_b, - const int32_t stride_ix_h, - const int32_t stride_ix_d, - const int32_t stride_iy_s, - const int32_t stride_iy_b, - const int32_t stride_iy_h, - const int32_t stride_iy_d, - const int32_t stride_ox_s, - const int32_t stride_ox_b, - const int32_t stride_ox_h, - const int32_t stride_ox_d, - const int32_t stride_oy_s, - const int32_t stride_oy_b, - const int32_t stride_oy_h, - const int32_t stride_oy_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_2c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output_x, + scalar_t* __restrict__ p_output_y, + const scalar_t* __restrict__ p_input_x, + const scalar_t* __restrict__ p_input_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer_0, + const int64_t* __restrict__ p_indirect_buffer_1, + const int32_t max_position, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_ix_s, + const int32_t stride_ix_b, + const int32_t stride_ix_h, + const int32_t stride_ix_d, + const int32_t stride_iy_s, + const int32_t stride_iy_b, + const int32_t stride_iy_h, + const int32_t stride_iy_d, + const int32_t stride_ox_s, + const int32_t stride_ox_b, + const int32_t stride_ox_h, + const int32_t stride_ox_d, + const int32_t stride_oy_s, + const int32_t stride_oy_b, + const int32_t stride_oy_h, + const int32_t stride_oy_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2099,20 +2104,20 @@ template -__global__ void -kn_entry_1c_sbhd_cached_indirect2_inplace(scalar_t* __restrict__ p_inout, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer_0, - const int64_t* __restrict__ p_indirect_buffer_1, - const int32_t max_position, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_s, - const int32_t stride_b, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ void kn_entry_1c_sbhd_cached_indirect2_inplace( + scalar_t* __restrict__ p_inout, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer_0, + const int64_t* __restrict__ p_indirect_buffer_1, + const int32_t max_position, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_s, + const int32_t stride_b, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2151,26 +2156,26 @@ template -__global__ void -kn_entry_2c_sbhd_cached_indirect2_inplace(scalar_t* __restrict__ p_inout_x, - scalar_t* __restrict__ p_inout_y, - const scalar_f_t* __restrict__ p_cos, - const scalar_f_t* __restrict__ p_sin, - const int64_t* __restrict__ p_indirect_buffer_0, - const int64_t* __restrict__ p_indirect_buffer_1, - const int32_t max_position, - const int32_t size_h_x, - const int32_t size_h_y, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_x_s, - const int32_t stride_x_b, - const int32_t stride_x_h, - const int32_t stride_x_d, - const int32_t stride_y_s, - const int32_t stride_y_b, - const int32_t stride_y_h, - const int32_t stride_y_d) +__launch_bounds__(256, 8) __global__ void kn_entry_2c_sbhd_cached_indirect2_inplace( + scalar_t* __restrict__ p_inout_x, + scalar_t* __restrict__ p_inout_y, + const scalar_f_t* __restrict__ p_cos, + const scalar_f_t* __restrict__ p_sin, + const int64_t* __restrict__ p_indirect_buffer_0, + const int64_t* __restrict__ p_indirect_buffer_1, + const int32_t max_position, + const int32_t size_h_x, + const int32_t size_h_y, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. 
+ const int32_t stride_x_s, + const int32_t stride_x_b, + const int32_t stride_x_h, + const int32_t stride_x_d, + const int32_t stride_y_s, + const int32_t stride_y_b, + const int32_t stride_y_h, + const int32_t stride_y_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2219,19 +2224,20 @@ template -__global__ void kn_entry_1c_thd_uncached(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const int32_t* __restrict__ p_cu_seqlens, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. - const int32_t stride_i_t, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_t, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_thd_uncached(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const int32_t* __restrict__ p_cu_seqlens, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_i_t, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_t, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2268,16 +2274,16 @@ template -__global__ void -kn_entry_1c_thd_uncached_inplace(scalar_t* __restrict__ p_inout, - const int32_t* __restrict__ p_cu_seqlens, - const scalar_f_t* __restrict__ p_freqs, - const int32_t size_h, - const int32_t size_d, - const int32_t size_f, // size of last dimension of freqs. 
- const int32_t stride_t, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_thd_uncached_inplace(scalar_t* __restrict__ p_inout, + const int32_t* __restrict__ p_cu_seqlens, + const scalar_f_t* __restrict__ p_freqs, + const int32_t size_h, + const int32_t size_d, + const int32_t size_f, // size of last dimension of freqs. + const int32_t stride_t, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t sid = blockIdx.x; const uint64_t bid = blockIdx.y; @@ -2314,23 +2320,24 @@ template -__global__ void kn_entry_1c_2d_cached(scalar_t* __restrict__ p_output, - const scalar_t* __restrict__ p_input, - const scalar_f_t* __restrict__ p_cos_h, - const scalar_f_t* __restrict__ p_sin_h, - const scalar_f_t* __restrict__ p_cos_w, - const scalar_f_t* __restrict__ p_sin_w, - const int32_t img_width, - const int32_t size_h, - const int32_t size_d, - const int32_t stride_i_b, - const int32_t stride_i_s, - const int32_t stride_i_h, - const int32_t stride_i_d, - const int32_t stride_o_b, - const int32_t stride_o_s, - const int32_t stride_o_h, - const int32_t stride_o_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_2d_cached(scalar_t* __restrict__ p_output, + const scalar_t* __restrict__ p_input, + const scalar_f_t* __restrict__ p_cos_h, + const scalar_f_t* __restrict__ p_sin_h, + const scalar_f_t* __restrict__ p_cos_w, + const scalar_f_t* __restrict__ p_sin_w, + const int32_t img_width, + const int32_t size_h, + const int32_t size_d, + const int32_t stride_i_b, + const int32_t stride_i_s, + const int32_t stride_i_h, + const int32_t stride_i_d, + const int32_t stride_o_b, + const int32_t stride_o_s, + const int32_t stride_o_h, + const int32_t stride_o_d) { const uint64_t Hid = blockIdx.x; const uint64_t Wid = blockIdx.y; @@ -2386,18 +2393,19 @@ template -__global__ void kn_entry_1c_2d_cached_inplace(scalar_t* __restrict__ p_inout, - const scalar_f_t* __restrict__ p_cos_h, - const scalar_f_t* 
__restrict__ p_sin_h, - const scalar_f_t* __restrict__ p_cos_w, - const scalar_f_t* __restrict__ p_sin_w, - const int32_t img_width, - const int32_t size_h, - const int32_t size_d, - const int32_t stride_b, - const int32_t stride_s, - const int32_t stride_h, - const int32_t stride_d) +__launch_bounds__(256, 8) __global__ + void kn_entry_1c_2d_cached_inplace(scalar_t* __restrict__ p_inout, + const scalar_f_t* __restrict__ p_cos_h, + const scalar_f_t* __restrict__ p_sin_h, + const scalar_f_t* __restrict__ p_cos_w, + const scalar_f_t* __restrict__ p_sin_w, + const int32_t img_width, + const int32_t size_h, + const int32_t size_d, + const int32_t stride_b, + const int32_t stride_s, + const int32_t stride_h, + const int32_t stride_d) { const uint64_t Hid = blockIdx.x; const uint64_t Wid = blockIdx.y; @@ -2648,6 +2656,32 @@ __global__ void kn_entry_1c_2d_cached_inplace(scalar_t* __restrict__ p_inout, } \ } +template +std::tuple get_grid_config(const int32_t size_s_h, + const int32_t size_s_w, + const int32_t size_b, + const int32_t size_f) +{ + constexpr int32_t num_warps = 4; + constexpr int32_t num_threads = num_warps * ck_tile::get_warp_size(); + + const int32_t size_r = ReuseFreqsFrontPart ? 
(size_f << 1) : size_f; + const int32_t size_half_r = size_r >> 1; + const int32_t aligned_size_half_r = ck_tile::next_power_of_two(size_half_r); + + const int32_t block_dim_x = std::min(aligned_size_half_r, ck_tile::get_warp_size()); + const int32_t block_dim_y = std::max(num_threads / block_dim_x, 1); + + if constexpr(Is2D) + { + return {dim3(size_s_h, size_s_w, size_b), dim3(block_dim_x, block_dim_y)}; + } + else + { + return {dim3(size_s_h * size_s_w, size_b), dim3(block_dim_x, block_dim_y)}; + } +} + template (size_s, 1, size_b, size_f); if(p_output == p_input) { @@ -2764,8 +2797,7 @@ void dispatch_2c_sbhd_uncached(scalar_t* __restrict__ p_output_x, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h_x < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if((p_output_x == p_input_x) && (p_output_y == p_input_y)) { @@ -2873,8 +2905,7 @@ void dispatch_1c_sbhd_cached(scalar_t* __restrict__ p_output, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if(p_output == p_input) { @@ -2967,8 +2998,7 @@ void dispatch_2c_sbhd_cached(scalar_t* __restrict__ p_output_x, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h_x < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if((p_output_x == p_input_x) && (p_output_y == p_input_y)) { @@ -3079,8 +3109,7 @@ void dispatch_1c_sbhd_cached_indirect(scalar_t* __restrict__ p_output, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h < 16 ? 
4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if(p_output == p_input) { @@ -3180,8 +3209,7 @@ void dispatch_2c_sbhd_cached_indirect(scalar_t* __restrict__ p_output_x, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h_x < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if((p_output_x == p_input_x) && (p_output_y == p_input_y)) { @@ -3298,8 +3326,7 @@ void dispatch_1c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if(p_output == p_input) { @@ -3403,8 +3430,7 @@ void dispatch_2c_sbhd_cached_indirect2(scalar_t* __restrict__ p_output_x, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h_x < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_s, 1, size_b, size_f); if((p_output_x == p_input_x) && (p_output_y == p_input_y)) { @@ -3519,8 +3545,7 @@ void dispatch_1c_thd_uncached(scalar_t* __restrict__ p_output, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(size_max_s, size_b); - const dim3 block(C10_WARP_SIZE, size_h < 16 ? 4 : 8); + auto [grid, block] = get_grid_config(size_max_s, 1, size_b, size_f); if(p_output == p_input) { @@ -3600,8 +3625,8 @@ void dispatch_1c_2d_cached(scalar_t* __restrict__ p_output, { const hipStream_t stream = at::hip::getCurrentHIPStream(); - const dim3 grid(img_height, img_width, size_b); - const dim3 block(C10_WARP_SIZE, size_h < 16 ? 
4 : 8); + auto [grid, block] = + get_grid_config(img_height, img_width, size_b, size_d >> 1); if(p_output == p_input) { From 7f8ed6ab58708edc44a1309a35f7bf24605178fc Mon Sep 17 00:00:00 2001 From: amd-ruitang3 <145657428+amd-ruitang3@users.noreply.github.com> Date: Fri, 19 Dec 2025 22:39:08 +0800 Subject: [PATCH 17/40] rm garbage from whl (#1696) --- MANIFEST.in | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index 0d8f6f5d5d..de0ff3c02c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,6 @@ graft aiter -graft aiter_meta \ No newline at end of file +graft aiter_meta + +# exclude cache and compiled files .pyc / .pyo / .pyd / .pyd +global-exclude *.py[cod] +prune aiter/jit/build \ No newline at end of file From 70562e8e8d217d09ff0440bd6a5b8c832a1383e4 Mon Sep 17 00:00:00 2001 From: zufayu Date: Fri, 19 Dec 2025 22:41:08 +0800 Subject: [PATCH 18/40] enhance prebuild logic (#1672) * enhance prebuild logic * ATen.h build issues * bug fix * bug fix II * bug fix III --------- Co-authored-by: zufayu Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> --- aiter/jit/utils/cpp_extension.py | 15 ++- csrc/include/rocm_ops.hpp | 135 +++++++++++++++++++----- setup.py | 173 +++++++++---------------------- 3 files changed, 172 insertions(+), 151 deletions(-) diff --git a/aiter/jit/utils/cpp_extension.py b/aiter/jit/utils/cpp_extension.py index 5799e47205..2bd4c14a67 100644 --- a/aiter/jit/utils/cpp_extension.py +++ b/aiter/jit/utils/cpp_extension.py @@ -1534,7 +1534,20 @@ def _write_ninja_file_to_build_library( extra_ldflags = [flag.strip() for flag in extra_ldflags] extra_include_paths = [flag.strip() for flag in extra_include_paths] # include_paths() gives us the location of torch/extension.h - system_includes = [] if torch_exclude else include_paths(with_cuda) + # system_includes = [] if torch_exclude else include_paths(with_cuda) + import torch + + _TORCH_PATH = os.path.dirname(torch.__file__) + 
TORCH_INCLUDE_ROOT = os.path.join(_TORCH_PATH, "include") + system_includes = [ + TORCH_INCLUDE_ROOT, + os.path.join(TORCH_INCLUDE_ROOT, "torch/csrc/api/include"), + os.path.join(TORCH_INCLUDE_ROOT, "TH"), + os.path.join(TORCH_INCLUDE_ROOT, "THC"), + ] + if not torch_exclude: + system_includes += include_paths(with_cuda) + system_includes = list(set(system_includes)) # FIXME: build python module excluded with torch, use `pybind11` # But we can't use this now because all aiter op based on torch diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 925deb96ea..135e8ae03b 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -3,6 +3,7 @@ #pragma once #include + namespace py = pybind11; #define ACTIVATION_PYBIND \ @@ -658,6 +659,64 @@ namespace py = pybind11; py::arg("rng_state") = std::nullopt, \ py::arg("gen") = std::nullopt); +#define ROCSOLGEMM_PYBIND \ + m.def("rocb_create_extension", &rocb_create_extension, "create_extension"); \ + m.def("rocb_destroy_extension", &rocb_destroy_extension, "destroy_extension"); \ + m.def("rocb_mm", &RocSolIdxBlas, "mm"); \ + m.def("rocb_findallsols", &RocFindAllSolIdxBlas, "rocblas_find_all_sols"); + +#define HIPBSOLGEMM_PYBIND \ + m.def("hipb_create_extension", &hipb_create_extension, "create_extension"); \ + m.def("hipb_destroy_extension", &hipb_destroy_extension, "destroy_extension"); \ + m.def("hipb_mm", \ + &hipb_mm, \ + "hipb_mm", \ + py::arg("mat1"), \ + py::arg("mat2"), \ + py::arg("solution_index"), \ + py::arg("bias") = std::nullopt, \ + py::arg("out_dtype") = std::nullopt, \ + py::arg("scaleA") = std::nullopt, \ + py::arg("scaleB") = std::nullopt, \ + py::arg("scaleOut") = std::nullopt, \ + py::arg("bpreshuffle") = std::nullopt); \ + m.def("hipb_findallsols", \ + &hipb_findallsols, \ + "hipb_findallsols", \ + py::arg("mat1"), \ + py::arg("mat2"), \ + py::arg("bias") = std::nullopt, \ + py::arg("out_dtype") = std::nullopt, \ + py::arg("scaleA") = std::nullopt, \ + py::arg("scaleB") = 
std::nullopt, \ + py::arg("scaleC") = std::nullopt, \ + py::arg("bpreshuffle") = false); \ + m.def("getHipblasltKernelName", &getHipblasltKernelName); + +#define LIBMHA_BWD_PYBIND \ + m.def("libmha_bwd", \ + &aiter::torch_itfs::mha_bwd, \ + py::arg("dout"), \ + py::arg("q"), \ + py::arg("k"), \ + py::arg("v"), \ + py::arg("out"), \ + py::arg("softmax_lse"), \ + py::arg("dropout_p"), \ + py::arg("softmax_scale"), \ + py::arg("is_causal"), \ + py::arg("window_size_left"), \ + py::arg("window_size_right"), \ + py::arg("deterministic"), \ + py::arg("dq") = std::nullopt, \ + py::arg("dk") = std::nullopt, \ + py::arg("dv") = std::nullopt, \ + py::arg("dbias") = std::nullopt, \ + py::arg("bias") = std::nullopt, \ + py::arg("alibi_slopes") = std::nullopt, \ + py::arg("rng_state") = std::nullopt, \ + py::arg("gen") = std::nullopt); + #define MHA_VARLEN_BWD_ASM_PYBIND \ m.def("fmha_v3_varlen_bwd", \ &aiter::torch_itfs::fmha_v3_varlen_bwd, \ @@ -756,32 +815,56 @@ namespace py = pybind11; py::arg("v_descale") = std::nullopt, \ py::arg("gen") = std::nullopt); -#define MHA_VARLEN_FWD_ASM_PYBIND \ - m.def("fmha_v3_varlen_fwd", \ - &aiter::torch_itfs::fmha_v3_varlen_fwd, \ - py::arg("q"), \ - py::arg("k"), \ - py::arg("v"), \ - py::arg("cu_seqlens_q"), \ - py::arg("cu_seqlens_k"), \ - py::arg("max_seqlen_q"), \ - py::arg("max_seqlen_k"), \ - py::arg("min_seqlen_q"), \ - py::arg("dropout_p"), \ - py::arg("softmax_scale"), \ - py::arg("logits_soft_cap"), \ - py::arg("zero_tensors"), \ - py::arg("is_causal"), \ - py::arg("window_size_left"), \ - py::arg("window_size_right"), \ - py::arg("return_softmax_lse"), \ - py::arg("return_dropout_randval"), \ - py::arg("how_v3_bf16_cvt"), \ - py::arg("out") = std::nullopt, \ - py::arg("block_table") = std::nullopt, \ - py::arg("bias") = std::nullopt, \ - py::arg("alibi_slopes") = std::nullopt, \ - py::arg("gen") = std::nullopt, \ +#define LIBMHA_FWD_PYBIND \ + m.def("libmha_fwd", \ + &aiter::torch_itfs::mha_fwd, \ + py::arg("q"), \ + 
py::arg("k"), \ + py::arg("v"), \ + py::arg("dropout_p"), \ + py::arg("softmax_scale"), \ + py::arg("is_causal"), \ + py::arg("window_size_left"), \ + py::arg("window_size_right"), \ + py::arg("sink_size"), \ + py::arg("return_softmax_lse"), \ + py::arg("return_dropout_randval"), \ + py::arg("cu_seqlens_q") = std::nullopt, \ + py::arg("cu_seqlens_kv") = std::nullopt, \ + py::arg("out") = std::nullopt, \ + py::arg("bias") = std::nullopt, \ + py::arg("alibi_slopes") = std::nullopt, \ + py::arg("q_descale") = std::nullopt, \ + py::arg("k_descale") = std::nullopt, \ + py::arg("v_descale") = std::nullopt, \ + py::arg("gen") = std::nullopt); + +#define MHA_VARLEN_FWD_ASM_PYBIND \ + m.def("fmha_v3_varlen_fwd", \ + &aiter::torch_itfs::fmha_v3_varlen_fwd, \ + py::arg("q"), \ + py::arg("k"), \ + py::arg("v"), \ + py::arg("cu_seqlens_q"), \ + py::arg("cu_seqlens_k"), \ + py::arg("max_seqlen_q"), \ + py::arg("max_seqlen_k"), \ + py::arg("min_seqlen_q"), \ + py::arg("dropout_p"), \ + py::arg("softmax_scale"), \ + py::arg("logits_soft_cap"), \ + py::arg("zero_tensors"), \ + py::arg("is_causal"), \ + py::arg("window_size_left"), \ + py::arg("window_size_right"), \ + py::arg("return_softmax_lse"), \ + py::arg("return_dropout_randval"), \ + py::arg("how_v3_bf16_cvt"), \ + py::arg("out") = std::nullopt, \ + py::arg("block_table") = std::nullopt, \ + py::arg("bias") = std::nullopt, \ + py::arg("alibi_slopes") = std::nullopt, \ + py::arg("gen") = std::nullopt, \ py::arg("cu_seqlens_q_padded") = std::nullopt, \ py::arg("cu_seqlens_k_padded") = std::nullopt); diff --git a/setup.py b/setup.py index 9b1540db52..c704459431 100644 --- a/setup.py +++ b/setup.py @@ -4,6 +4,7 @@ import os import shutil import sys +import json from setuptools import Distribution, setup @@ -81,131 +82,55 @@ def is_develop_mode(): shutil.copytree("gradlib", "aiter_meta/gradlib") shutil.copytree("csrc", "aiter_meta/csrc") + def _load_modules_from_config(): + cfg_path = os.path.join(this_dir, "aiter", "jit", 
"optCompilerConfig.json") + try: + with open(cfg_path, "r", encoding="utf-8") as f: + data = json.load(f) + except Exception: + return [] + if isinstance(data, dict): + return list(data.keys()) + return [] + def get_exclude_ops(): - if PREBUILD_KERNELS == 1: - return [ - "libmha_fwd", - "libmha_bwd", - "module_fmha_v3_fwd", - "module_mha_fwd", - "module_mha_varlen_fwd", - "module_mha_batch_prefill", - "module_fmha_v3_bwd", - "module_fmha_v3_varlen_bwd", - "module_fmha_v3_varlen_fwd", - "module_mha_bwd", - "module_mha_varlen_bwd", - "module_batched_gemm_bf16_tune", - "module_batched_gemm_a8w8_tune", - "module_gemm_a8w8_tune", - "module_gemm_a8w8_blockscale_tune", - "module_gemm_a8w8_blockscale_bpreshuffle_tune", - "module_gemm_a4w4_blockscale_tune", - "module_gemm_a8w8_bpreshuffle_tune", - "module_gemm_a8w8_bpreshuffle_cktile_tune", - "module_gemm_mi350_a8w8_blockscale_asm", - ] - elif PREBUILD_KERNELS == 2: - return [ - "libmha_bwd", - "module_mha_batch_prefill", - "module_fmha_v3_bwd", - "module_fmha_v3_varlen_bwd", - "module_mha_bwd", - "module_mha_varlen_bwd", - "module_batched_gemm_bf16_tune", - "module_batched_gemm_a8w8_tune", - "module_gemm_a8w8_tune", - "module_gemm_a8w8_blockscale_tune", - "module_gemm_a8w8_blockscale_bpreshuffle_tune", - "module_gemm_a4w4_blockscale_tune", - "module_gemm_a8w8_bpreshuffle_tune", - "module_gemm_a8w8_bpreshuffle_cktile_tune", - "module_gemm_mi350_a8w8_blockscale_asm", - ] - elif PREBUILD_KERNELS == 3: - return [ - "module_activation", - "module_attention", - "module_pa_ragged", - "module_pa_v1", - "module_attention_asm", - "module_pa", - "module_mla_asm", - "module_cache", - "module_custom_all_reduce", - "module_quick_all_reduce", - "module_custom", - "module_gemm_common", - "module_batched_gemm_bf16", - "module_batched_gemm_a8w8", - "module_gemm_a8w8", - "module_gemm_a8w8_blockscale", - "module_gemm_a8w8_blockscale_bpreshuffle", - "module_gemm_a4w4_blockscale", - "module_gemm_a8w8_bpreshuffle", - "module_deepgemm", - 
"module_gemm_a8w8_bpreshuffle_cktile", - "module_gemm_a8w8_asm", - "module_gemm_a16w16_asm", - "module_gemm_a4w4_asm", - "module_gemm_a8w8_blockscale_asm", - "module_gemm_a8w8_blockscale_bpreshuffle_asm", - "module_gemm_mi350_a8w8_blockscale_asm", - "module_moe_asm", - "module_moe_ck2stages", - "module_moe_cktile2stages", - "module_moe_sorting", - "module_moe_topk", - "module_norm", - "module_pos_encoding", - "module_rmsnorm", - "module_smoothquant", - "module_batched_gemm_bf16_tune", - "module_batched_gemm_a8w8_tune", - "module_gemm_a8w8_tune", - "module_gemm_a8w8_blockscale_tune", - "module_gemm_a8w8_blockscale_bpreshuffle_tune", - "module_gemm_a4w4_blockscale_tune", - "module_gemm_a8w8_bpreshuffle_tune", - "module_gemm_a8w8_bpreshuffle_cktile_tune", - "module_aiter_operator", - "module_aiter_unary", - "module_quant", - "module_sample", - "module_rope_general_fwd", - "module_rope_general_bwd", - "module_rope_pos_fwd", - "module_fused_mrope_rms", - # "module_fmha_v3_fwd", - "module_mha_fwd", - "module_mha_varlen_fwd", - # "module_fmha_v3_bwd", - "module_fmha_v3_varlen_bwd", - "module_fmha_v3_varlen_fwd", - "module_mha_bwd", - "module_mha_varlen_bwd", - "libmha_fwd", - "libmha_bwd", - "module_rocsolgemm", - "module_hipbsolgemm", - "module_top_k_per_row", - "module_mla_metadata", - "module_mla_reduce", - "module_topk_plain", - ] - else: - return [ - "module_gemm_mi350_a8w8_blockscale_asm", - "module_batched_gemm_bf16_tune", - "module_batched_gemm_a8w8_tune", - "module_gemm_a8w8_tune", - "module_gemm_a8w8_blockscale_tune", - "module_gemm_a8w8_blockscale_bpreshuffle_tune", - "module_gemm_a4w4_blockscale_tune", - "module_gemm_a8w8_bpreshuffle_tune", - "module_gemm_a8w8_bpreshuffle_cktile_tune", - ] + all_modules = _load_modules_from_config() + exclude_ops = [] + + for module in all_modules: + if PREBUILD_KERNELS == 1: + # Exclude mha, _tune, and specific module + if ( + "mha" in module + or "_tune" in module + or module == "module_gemm_mi350_a8w8_blockscale_asm" + ): + 
exclude_ops.append(module) + elif PREBUILD_KERNELS == 2: + # Exclude _bwd, _tune, and specific module + if ( + "_bwd" in module + or "_tune" in module + or module == "module_gemm_mi350_a8w8_blockscale_asm" + ): + exclude_ops.append(module) + elif PREBUILD_KERNELS == 3: + # Keep only module_fmha_v3* and module_aiter_enum + if not ( + module.startswith("module_fmha_v3") + or module == "module_aiter_enum" + or module == "module_gemm_mi350_a8w8_blockscale_asm" + ): + exclude_ops.append(module) + else: + # Default behavior: exclude tunes and specific mi350 module + if ( + "_tune" in module + or module == "module_gemm_mi350_a8w8_blockscale_asm" + ): + exclude_ops.append(module) + + return exclude_ops exclude_ops = get_exclude_ops() From bf02586feaed0fe4550c4fbb9d5ed32353f7dbb7 Mon Sep 17 00:00:00 2001 From: amirumoAMD Date: Fri, 19 Dec 2025 11:06:36 -0500 Subject: [PATCH 19/40] LLfp4 qr cap for atom (#1673) * QR cap implemented to limit QR to prefill * test git config * Fix to genericize qr comm cap * Incorrect cap number --- aiter/dist/device_communicators/communicator_cuda.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/aiter/dist/device_communicators/communicator_cuda.py b/aiter/dist/device_communicators/communicator_cuda.py index 60ff59ca57..55c3fa3bfc 100644 --- a/aiter/dist/device_communicators/communicator_cuda.py +++ b/aiter/dist/device_communicators/communicator_cuda.py @@ -155,6 +155,8 @@ def all_reduce( qr_comm is not None and not qr_comm.disabled and qr_comm.should_quick_allreduce(input_) + and (input_.nelement() * input_.element_size()) >= 4*1024*1024 # input shape should be such that quick reduce will show benefits. + # input shape estimated at 2 * max concurrency for now. 
if performance issues, subject to change ): out = qr_comm.quick_all_reduce(input_) assert out is not None From 3ad1c76a4a70c7a5aa86d45ed23aa4033d5ca0bf Mon Sep 17 00:00:00 2001 From: Zzz9990 Date: Sat, 20 Dec 2025 08:36:03 +0800 Subject: [PATCH 20/40] [MLA] MLA conditions rewrite (#1665) * open mla mtp and remove some logs * fix qlen dense 128,N * fix hint * support sparse qlen input = 1 * change default splits --- aiter/mla.py | 15 ++++-- aiter/ops/attention.py | 3 +- csrc/kernels/mla/metadata/v1_2_device.cuh | 62 ++++++++++++++++------ op_tests/test_mla.py | 43 +++++++++------- op_tests/test_mla_persistent.py | 51 +++++++++--------- op_tests/test_mla_sparse.py | 63 ++++++++++++----------- 6 files changed, 142 insertions(+), 95 deletions(-) diff --git a/aiter/mla.py b/aiter/mla.py index 6f4cd2150a..1e09e1bd51 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -162,6 +162,8 @@ def mla_decode_fwd( q_scale=None, kv_scale=None, intra_batch_mode=False, + return_logits=False, + return_lse=False, ): device = q.device assert logit_cap <= 0, f"{logit_cap=} is not support yet" @@ -271,7 +273,7 @@ def mla_decode_fwd( ): # Natively support cases pass - elif nhead in range(32, 128 + 1, 16) and persistent_mode and max_seqlen_q == 1: + elif nhead in range(32, 128 + 1, 16) and persistent_mode: # we use nhead=16 to simulate such cases by customized metadata # metadata also views qo's tensor as shape (total_s * (nhead // 16), 16, ...) 
total_s = ori_total_s * (ori_nhead // 16) @@ -292,7 +294,11 @@ def mla_decode_fwd( dtype=dtypes.fp32, device=device, ) - final_lse = torch.empty((total_s, nhead), dtype=dtypes.fp32, device=device) + final_lse = ( + torch.empty((total_s, nhead), dtype=dtypes.fp32, device=device) + if return_lse + else None + ) aiter.mla_decode_stage1_asm_fwd( q, @@ -326,10 +332,9 @@ def mla_decode_fwd( ) if io_transformed: - if persistent_mode: + if return_logits: logits = logits.view(-1, 1, ori_nhead, v_head_dim) - else: - logits = logits.view(ori_total_s, num_kv_splits, ori_nhead, v_head_dim) + q = q.view(ori_total_s, ori_nhead, -1) o = o.view(ori_total_s, ori_nhead, -1) diff --git a/aiter/ops/attention.py b/aiter/ops/attention.py index 20101480eb..a433bd213a 100644 --- a/aiter/ops/attention.py +++ b/aiter/ops/attention.py @@ -629,7 +629,8 @@ def get_mla_metadata_info_v1( max_qo_tiles_per_batch = ( int(math.ceil(max_seqlen_qo * num_head_qo / 128)) - if num_head_qo == 16 or (num_head_qo == 128 and kv_dtype == dtypes.fp8) + if num_head_qo == 16 + or (num_head_qo == 128 and kv_dtype == dtypes.fp8 and q_dtype == dtypes.fp8) else int(math.ceil(max_seqlen_qo * num_head_qo / 16)) ) batch_size = batch_size * max_seqlen_qo if is_sparse else batch_size diff --git a/csrc/kernels/mla/metadata/v1_2_device.cuh b/csrc/kernels/mla/metadata/v1_2_device.cuh index ad64bce238..b96051874d 100644 --- a/csrc/kernels/mla/metadata/v1_2_device.cuh +++ b/csrc/kernels/mla/metadata/v1_2_device.cuh @@ -28,12 +28,34 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ { using QoState = QoState; + const int32_t ori_seqlen_qo = [&]() { + if constexpr (Traits::kIsSparse) + { + return params.p_seqlens_qo_indptr[1] - params.p_seqlens_qo_indptr[0]; + } + else + { + return params.ori_seqlen_qo; + } + }(); + + const int32_t num_batches = [&]() { + if constexpr (Traits::kIsSparse) + { + return params.num_batches * ori_seqlen_qo; + } + else + { + return params.num_batches; + } + }(); + extern __shared__ uint8_t 
p_smem[]; int32_t* p_lds_seqlens_qo = reinterpret_cast(p_smem); - int32_t* p_lds_seqlens_kv = p_lds_seqlens_qo + (QoState::is_unique() ? 0 : params.num_batches); + int32_t* p_lds_seqlens_kv = p_lds_seqlens_qo + (QoState::is_unique() ? 0 : num_batches); QoState qo_state( - params.uni_seqlen_qo, params.ori_seqlen_qo, p_lds_seqlens_qo, params.p_seqlens_qo_indptr); + params.uni_seqlen_qo, ori_seqlen_qo, p_lds_seqlens_qo, params.p_seqlens_qo_indptr); auto get_num_qo_tiles = [&](const int32_t batch_idx) { if constexpr(Traits::kQoSplits) @@ -53,10 +75,10 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ MlaWorkInfo* p_work_info_set = reinterpret_cast(params.p_work_info_set_raw); int32_t sum_blocks = 0; - for(int32_t bid = lane_idx; bid < params.num_batches; bid += ck_tile::get_warp_size()) + for(int32_t bid = lane_idx; bid < num_batches; bid += ck_tile::get_warp_size()) { const int32_t bid_ori = Traits::kIsSparse - ? (bid / params.ori_seqlen_qo / params.qk_batch_ratio) + ? (bid / ori_seqlen_qo / params.qk_batch_ratio) : (bid / params.qk_batch_ratio); const int32_t kv_end = params.p_seqlens_kv_indptr[bid_ori + 1]; const int32_t seqlen_kv = @@ -119,7 +141,7 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ for(int32_t cid = 0; cid < params.num_cu; ++cid) { int32_t remain_payload = payload; - while(curr_batch < params.num_batches) + while(curr_batch < num_batches) { const int32_t num_qo_tiles = get_num_qo_tiles(curr_batch); const int32_t qo_tile_size = @@ -143,9 +165,17 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ work_info.qo_end = ck_tile::min(work_info.qo_start + qo_tile_size, qo_state.get_end(curr_batch)); work_info.kv_start = curr_kv_begin + (curr_kv_block * params.kv_granularity); + int32_t batch_tail = (num_qo_tiles - 1 - curr_qo_tile_idx); + if constexpr(!Traits::kIsSparse) + { + if (params.qk_batch_ratio != 1) + { + batch_tail = num_qo_tiles - (work_info.qo_start / params.qk_batch_ratio) % ori_seqlen_qo - 1; + } + } 
work_info.kv_end = ck_tile::min( work_info.kv_start + (remain_kv_blocks * params.kv_granularity), - curr_kv_end - (num_qo_tiles - 1 - curr_qo_tile_idx)); + curr_kv_end - batch_tail); work_info.kv_offset = curr_kv_end - work_info.kv_end; // split related info @@ -202,7 +232,7 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ curr_sub_head_idx = (curr_sub_head_idx == (params.qk_batch_ratio - 1)) ? 0 : (curr_sub_head_idx + 1); - if(curr_batch < params.num_batches) + if(curr_batch < num_batches) { if(curr_sub_head_idx == 0) { @@ -213,7 +243,7 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ else { const int32_t bid_ori = Traits::kIsSparse - ? (curr_batch / params.ori_seqlen_qo / + ? (curr_batch / ori_seqlen_qo / params.qk_batch_ratio) : (curr_batch / params.qk_batch_ratio); curr_kv_seqlen = params.p_seqlens_kv_indptr[bid_ori + 1] - @@ -251,9 +281,17 @@ __launch_bounds__(ck_tile::get_warp_size(), 1) __global__ qo_state.get_end(curr_batch)); work_info.kv_start = curr_kv_begin + (curr_kv_block * params.kv_granularity); + int32_t batch_tail = (num_qo_tiles - 1 - curr_qo_tile_idx); + if constexpr(!Traits::kIsSparse) + { + if (params.qk_batch_ratio != 1) + { + batch_tail = num_qo_tiles - (work_info.qo_start / params.qk_batch_ratio) % ori_seqlen_qo - 1; + } + } work_info.kv_end = ck_tile::min( work_info.kv_start + (consuming_blks * params.kv_granularity), - curr_kv_end - (num_qo_tiles - 1 - curr_qo_tile_idx)); + curr_kv_end - batch_tail); work_info.kv_offset = curr_kv_end - work_info.kv_end; work_info.partial_qo_loc = partial_idx; p_work_info_set[num_works] = work_info; @@ -365,12 +403,6 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba num_batches *= qk_batch_ratio; } - if(is_sparse) - { - num_batches *= uni_seqlen_qo; - uni_seqlen_qo = 1; - } - TORCH_CHECK((num_heads == 16) || (num_heads == 128), __func__, ": only supports #heads in [16, 128], or (#head, uni_seqlen_qo) = (16*N, 1) where " diff --git 
a/op_tests/test_mla.py b/op_tests/test_mla.py index efe8b47f71..0307082441 100644 --- a/op_tests/test_mla.py +++ b/op_tests/test_mla.py @@ -19,6 +19,12 @@ # qdtype fp8, kdtype fp8: nhead16, nhead128 +def check_support(dtype, kv_dtype, nhead): + if dtype == dtypes.fp8 and kv_dtype == dtypes.bf16: + return False + return True + + def cal_diff( x: torch.Tensor, y: torch.Tensor, name: str, use_fp8: bool = False ) -> None: @@ -445,11 +451,11 @@ def test_absorb_decode_fp8(): err = None us_asm_decode = 1e12 - if (dtype == torch.bfloat16 and kvtype == torch.bfloat16) and nhead in [16, 128]: + if dtype == torch.bfloat16 and nhead in [16, 128]: err, us_asm_decode = test_absorb_decode_bf16() - elif kvtype == dtypes.fp8 and nhead in [16, 128]: err, us_asm_decode = test_absorb_decode_fp8() + ret["decode:err"] = err ret["decode:asm_576"] = us_asm_decode @@ -599,22 +605,23 @@ def test_absorb_decode_fp8(): for dtype, kvtype, ctx_len, batch_size, split_per_batch in itertools.product( list_dtype, l_kv_dtype, args.ctxLen, args.batchSize, args.split_per_batch ): - ret = test_mla( - ctx_len, - batch_size, - nhead, - args.kv_lora_rank, - args.qk_nope_head_dim, - args.qk_rope_head_dim, - args.v_head_dim, - dtype, - kvtype, - args.block_size, - varlen=args.varlen, - decode_qlen=decode_qlen, - split_per_batch=split_per_batch, - ) - df.append(ret) + if check_support(dtype, kvtype, nhead): + ret = test_mla( + ctx_len, + batch_size, + nhead, + args.kv_lora_rank, + args.qk_nope_head_dim, + args.qk_rope_head_dim, + args.v_head_dim, + dtype, + kvtype, + args.block_size, + varlen=args.varlen, + decode_qlen=decode_qlen, + split_per_batch=split_per_batch, + ) + df.append(ret) df = pd.DataFrame(df) # df.to_csv(f"mla_nhead{nhead}decode_qlen{decode_qlen}.csv") aiter.logger.info(f"summary:\n{df}") diff --git a/op_tests/test_mla_persistent.py b/op_tests/test_mla_persistent.py index 03b8695b91..68d558048a 100644 --- a/op_tests/test_mla_persistent.py +++ b/op_tests/test_mla_persistent.py @@ -18,6 +18,12 
@@ # qdtype fp8, kdtype bf16: nhead16 +def check_support(dtype, kv_dtype, nhead): + if dtype == dtypes.fp8 and kv_dtype == dtypes.bf16: + return False + return True + + def cal_diff( x: torch.Tensor, y: torch.Tensor, name: str, use_fp8: bool = False ) -> None: @@ -401,15 +407,9 @@ def test_absorb_decode_fp8(): err = None us_asm_decode = 1e12 - if (dtype == torch.bfloat16 and kvtype == torch.bfloat16) and ( - (nhead in [16]) or (decode_qlen == 1 and nhead in range(32, 128 + 1, 16)) - ): + if dtype == torch.bfloat16: err, us_asm_decode = test_absorb_decode_bf16() - elif kvtype == dtypes.fp8 and ( - (dtype == dtypes.fp8 and nhead in [16, 128]) - or (dtype == dtypes.bf16 and nhead in [16]) - or (decode_qlen == 1 and nhead in range(32, 128 + 1, 16)) - ): + elif kvtype == dtypes.fp8: err, us_asm_decode = test_absorb_decode_fp8() ret["decode:err"] = err ret["decode:asm_576"] = us_asm_decode @@ -566,23 +566,24 @@ def test_absorb_decode_fp8(): for dtype, kvtype, ctx_len, batch_size, max_split_per_batch in itertools.product( list_dtype, l_kv_dtype, args.ctxLen, args.batchSize, args.max_split_per_batch ): - ret = test_mla( - ctx_len, - batch_size, - nhead, - args.kv_lora_rank, - args.qk_nope_head_dim, - args.qk_rope_head_dim, - args.v_head_dim, - dtype, - kvtype, - args.block_size, - varlen=args.varlen, - decode_qlen=decode_qlen, - max_split_per_batch=max_split_per_batch, - non_persistent_mode=args.non_persistent_mode, - ) - df.append(ret) + if check_support(dtype, kvtype, nhead): + ret = test_mla( + ctx_len, + batch_size, + nhead, + args.kv_lora_rank, + args.qk_nope_head_dim, + args.qk_rope_head_dim, + args.v_head_dim, + dtype, + kvtype, + args.block_size, + varlen=args.varlen, + decode_qlen=decode_qlen, + max_split_per_batch=max_split_per_batch, + non_persistent_mode=args.non_persistent_mode, + ) + df.append(ret) df = pd.DataFrame(df) # df.to_csv(f"mla_nhead{nhead}decode_qlen{decode_qlen}.csv") aiter.logger.info(f"summary:\n{df}") diff --git a/op_tests/test_mla_sparse.py 
b/op_tests/test_mla_sparse.py index c93170b0c5..6372f9e9de 100644 --- a/op_tests/test_mla_sparse.py +++ b/op_tests/test_mla_sparse.py @@ -20,6 +20,12 @@ # qdtype fp8, kdtype bf16: nhead16 +def check_support(dtype, kv_dtype, nhead): + if dtype == dtypes.fp8 and kv_dtype == dtypes.bf16: + return False + return True + + def cal_diff( x: torch.Tensor, y: torch.Tensor, name: str, use_fp8: bool = False ) -> None: @@ -450,8 +456,8 @@ def test_mla( reduce_final_map, reduce_partial_map, kv_granularity=max(page_size, 16), - max_seqlen_qo=int(max_seqlen_qo), - uni_seqlen_qo=decode_qlen, + max_seqlen_qo=1, + uni_seqlen_qo=1, fast_mode=True, max_split_per_batch=max_split_per_batch, topk=2048, @@ -525,7 +531,7 @@ def test_sparse_mla_bf16(): ) return err, us_asm_decode - def test_absorb_decode_fp8(): + def test_sparse_mla_fp8(): if dtype != dtypes.fp8 and nhead == 128: aiter.logger.info("don't support this case:\n") return None, 1e12 @@ -597,16 +603,10 @@ def test_absorb_decode_fp8(): err = None us_asm_decode = 1e12 - if (dtype == torch.bfloat16 and kvtype == torch.bfloat16) and ( - (nhead in [16]) or (max_seqlen_qo == 1 and nhead in range(32, 128 + 1, 16)) - ): + if dtype == torch.bfloat16: err, us_asm_decode = test_sparse_mla_bf16() - elif kvtype == dtypes.fp8 and ( - (dtype == dtypes.fp8 and nhead in [16, 128]) - or (dtype == dtypes.bf16 and nhead in [16]) - or (decode_qlen == 1 and nhead in range(32, 128 + 1, 16)) - ): - err, us_asm_decode = test_absorb_decode_fp8() + elif kvtype == dtypes.fp8: + err, us_asm_decode = test_sparse_mla_fp8() ret["decode:err"] = err ret["decode:asm_576"] = us_asm_decode @@ -684,7 +684,7 @@ def test_absorb_decode_fp8(): type=str, choices=["bf16", "fp8"], nargs="*", - default=["bf16"], + default=["bf16", "fp8"], help="""Data type of Q. e.g.: -d bf16""", ) @@ -694,7 +694,7 @@ def test_absorb_decode_fp8(): type=str, choices=["bf16", "fp8"], nargs="*", - default=["bf16"], + default=["bf16", "fp8"], help="""Data type of KV. 
e.g.: -kvd bf16""", ) @@ -731,7 +731,7 @@ def test_absorb_decode_fp8(): "--max_split_per_batch", type=int, nargs="*", - default=[16], + default=[32], help="""kv seqlens max split num for per batch. e.g.: -ms 32""", ) @@ -755,22 +755,23 @@ def test_absorb_decode_fp8(): for dtype, kvtype, ctx_len, batch_size, max_split_per_batch in itertools.product( list_dtype, l_kv_dtype, args.ctxLen, args.batchSize, args.max_split_per_batch ): - ret = test_mla( - ctx_len, - batch_size, - nhead, - args.kv_lora_rank, - args.qk_nope_head_dim, - args.qk_rope_head_dim, - args.v_head_dim, - dtype, - kvtype, - args.block_size, - varlen=args.varlen, - decode_qlen=decode_qlen, - max_split_per_batch=max_split_per_batch, - ) - df.append(ret) + if check_support(dtype, kvtype, nhead): + ret = test_mla( + ctx_len, + batch_size, + nhead, + args.kv_lora_rank, + args.qk_nope_head_dim, + args.qk_rope_head_dim, + args.v_head_dim, + dtype, + kvtype, + args.block_size, + varlen=args.varlen, + decode_qlen=decode_qlen, + max_split_per_batch=max_split_per_batch, + ) + df.append(ret) df = pd.DataFrame(df) # df.to_csv(f"mla_nhead{nhead}decode_qlen{decode_qlen}.csv") aiter.logger.info(f"summary:\n{df}") From 84b2b2f624a68777985099a8c81e55689753b81c Mon Sep 17 00:00:00 2001 From: Zzz9990 Date: Sat, 20 Dec 2025 08:37:43 +0800 Subject: [PATCH 21/40] fix dp causal (#1677) --- csrc/kernels/mla/metadata/v1_0_device.cuh | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/csrc/kernels/mla/metadata/v1_0_device.cuh b/csrc/kernels/mla/metadata/v1_0_device.cuh index a3a9fe2e6f..f7d0d5ea5b 100644 --- a/csrc/kernels/mla/metadata/v1_0_device.cuh +++ b/csrc/kernels/mla/metadata/v1_0_device.cuh @@ -47,20 +47,19 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params) const int32_t bid_ori = bid / params.qk_batch_ratio; const int32_t kv_begin = params.p_seqlens_kv_indptr[bid_ori]; - const int32_t kv_end = params.p_seqlens_kv_indptr[bid_ori + 1]; - int32_t kv_tail = [&](){ if 
constexpr(DP_MODE) { - // max(*, 0) for cuda graph capture: kvlen < mtp+1 - return max(bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1, 0); + return bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1; } else { return 0; } }(); - const int32_t seqlen_kv = kv_end - kv_begin + kv_tail; + const int32_t kv_end = max(params.p_seqlens_kv_indptr[bid_ori + 1] + kv_tail, kv_begin + 1); + + const int32_t seqlen_kv = kv_end - kv_begin; const int32_t num_blocks = integer_divide_ceil_power2( seqlen_kv, params.kv_granularity, params.kv_granularity_log2); @@ -98,19 +97,17 @@ void kn_get_mla_metadata_v1_0(MlaMetadataV1KernelParameter params) const int32_t bid_ori = bid / params.qk_batch_ratio; const int32_t kv_begin = p_lds_kv_seqlen[bid_ori]; - int32_t kv_end = p_lds_kv_seqlen[bid_ori + 1]; int32_t kv_tail = [&](){ if constexpr(DP_MODE) { - // max(*, 0) for cuda graph capture: kvlen < mtp+1 - return max(bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1, 0); + return bid % params.ori_seqlen_qo - params.ori_seqlen_qo + 1; } else { return 0; } }(); - kv_end += kv_tail; + const int32_t kv_end = max(p_lds_kv_seqlen[bid_ori + 1] + kv_tail, kv_begin + 1); MlaWorkInfo work_info{}; const int32_t split_start = p_lds_shift[bid]; const int32_t split_local = p_lds_split[bid]; From 723467d3aad98e4ad83d2d2d47ded1e5b8954e16 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Sat, 20 Dec 2025 06:18:57 -0500 Subject: [PATCH 22/40] add two fp4 tune shapes and tuned config (#1687) * add two fp4 tune shapes and tuned config * change 32800 to 65536 to cover all cases between 32768 to 65536 as per feedback --- aiter/configs/a4w4_blockscale_tuned_gemm.csv | 2 ++ aiter/configs/a4w4_blockscale_untuned_gemm.csv | 2 ++ 2 files changed, 4 insertions(+) diff --git a/aiter/configs/a4w4_blockscale_tuned_gemm.csv b/aiter/configs/a4w4_blockscale_tuned_gemm.csv index 51a05157f0..3988c91f18 100644 --- a/aiter/configs/a4w4_blockscale_tuned_gemm.csv +++ 
b/aiter/configs/a4w4_blockscale_tuned_gemm.csv @@ -921,3 +921,5 @@ cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio 256,8,3072,1536,42,0,5.4682,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_128x128E,13.81,441.57,0.0 256,8,7168,2048,29,0,5.836,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,40.25,1278.77,0.0 256,8,512,7168,29,0,9.6677,_ZN5aiter41f4gemm_bf16_per1x32Fp4_BpreShuffle_64x128E,6.07,193.62,0.0 +256,32768,2112,7168,48,0,293.0219,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3385.88,898.98,0.0 +256,65536,2112,7168,48,0,575.6528,_ZN5aiter42f4gemm_bf16_per1x32Fp4_BpreShuffle_160x384E,3447.0,902.06,0.0 diff --git a/aiter/configs/a4w4_blockscale_untuned_gemm.csv b/aiter/configs/a4w4_blockscale_untuned_gemm.csv index 3c91c37b07..e78f1eb3f5 100644 --- a/aiter/configs/a4w4_blockscale_untuned_gemm.csv +++ b/aiter/configs/a4w4_blockscale_untuned_gemm.csv @@ -193,3 +193,5 @@ M,N,K 3000, 7168, 2048 3000, 512, 7168 60000, 4096, 512 +32768, 2112, 7168 +65536, 2112, 7168 From 48ee8cc97864bba13f30815ab32f0ea300470b83 Mon Sep 17 00:00:00 2001 From: yadaish Date: Sat, 20 Dec 2025 19:22:31 +0800 Subject: [PATCH 23/40] Dev/a8w4 and a8w8splitk (#1667) * support moe a8w8 splitk (#1654) * Add support to a8w8_ck_moe_blk_gemm1 splitk * add switch and add some logging * tiny fix * update ck 3rd party and add some logging * add AITER_HEURISTIC_ONLY env * update ck * add condition to bypass tuned cfg * change bypass type * fix * fix removed log * upate ck submodule * fix lint * force to run tests --------- Co-authored-by: oscar * Zan/moe a8w4 (#1655) * update * update * update quant * ut ready * update quant type * compile pass * python3 op_tests/test_moe_2stage.py -t 16 -e 1 -k 1 -dim 256,256 ready * update aiter dipatcher for bf16&fp8 * support a16 a8 dispatch * finish quant & sort * update aiter framework for a8w4 moe * update ck * update * update * update for atom * update --------- Co-authored-by: Zzz9990 Co-authored-by: root * update ck * fix dispatch 
* fix too much logging * update * update ck * update ck * fix ruff code style * revert aiter-test yaml * fix ci * fix ci * fix ci * add mocked tuned result and decoding cfg token to next power of 2 * Update tuned_fmoe.csv remove duplicate * remove hack dtype * fix black * unique index * add empty arg to ck_moe_stage1 * resolve bias into lru cache * rename bypass cfg to AITER_BYPASS_TUNE_CONFIG --------- Co-authored-by: oscar Co-authored-by: Zzz9990 Co-authored-by: root Co-authored-by: felix Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> --- 3rdparty/composable_kernel | 2 +- aiter/configs/tuned_fmoe.csv | 434 ++++++++++- aiter/fused_moe.py | 203 ++++- aiter/ops/moe_op.py | 23 +- aiter/ops/quant.py | 36 + aiter/ops/triton/fused_mxfp4_quant.py | 194 ++++- aiter/ops/triton/utils/gemm_config_utils.py | 2 +- .../gemm_a4w4_blockscale_tune.py | 1 - .../gemm_moe_ck2stages.cu | 24 +- .../gemm_moe_ck2stages.h | 22 +- .../gemm_moe_ck2stages_common.cuh | 315 ++++---- .../gemm_moe_ck2stages_common.py | 6 +- .../gemm_moe_ck2stages_common_blockscale.cuh | 28 +- .../gemm_moe_ck2stages_common_mxfp4.cuh | 12 +- .../gemm_moe_ck2stages_common_mxfp4_bns.cuh | 22 +- .../gen_instances.py | 28 +- .../ck_tile_gemm_moe_2stages/gen_instances.py | 157 ++-- .../include/moe_cktile2stages_common.cuh | 112 +-- .../moe_cktile2stages.cu | 34 +- .../moe_cktile2stages_common.py | 164 +++- csrc/cpp_itfs/mha_bwd.cpp | 5 +- csrc/include/mha_bwd.h | 98 +-- csrc/include/mha_fwd.h | 2 +- csrc/include/moe_ck.h | 8 +- csrc/include/rocm_ops.hpp | 64 +- csrc/include/topk_per_row.h | 1 - csrc/kernels/mla/metadata/v1_1_device.cuh | 712 ++++++++++-------- csrc/kernels/mla/metadata/v1_1_host.cuh | 189 +++-- hsa/gfx942/fmha_v3_fwd/codegen.py | 2 +- op_tests/cpp/mha/benchmark_mha_bwd.cpp | 40 +- op_tests/test_gemm_a16w16.py | 2 - op_tests/test_moe_2stage.py | 49 +- 32 files changed, 2162 insertions(+), 829 deletions(-) diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel 
index f5573f56d9..9a6e61de97 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit f5573f56d9d4981def16f575ddb14535b93bb9bb +Subproject commit 9a6e61de9787be2e7ed4a9566cb59a420c5d3f78 diff --git a/aiter/configs/tuned_fmoe.csv b/aiter/configs/tuned_fmoe.csv index 275cd8944e..1ce76a65be 100644 --- a/aiter/configs/tuned_fmoe.csv +++ b/aiter/configs/tuned_fmoe.csv @@ -4,6 +4,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,17.6606,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,15.126,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.3%,32.7866,0,5.18,2591.37 80,4,2304,1536,8,2,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,17.8008,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,14.5115,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,32.3123,0,5.26,2629.41 80,512,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,774.6328,moe_ck2stages_gemm1_256x64x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,459.0113,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCastExpertWeight_v3_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.3%,1233.6441,0,125.34,989.38 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11 
+256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.4639,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,70.3202,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,200.7841,0,7.02,14040.11 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,195.38,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,107.5659,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,302.9459,0,9.3,9306.91 
256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,278.093,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,140.8376,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,418.9306,0,13.46,6732.4 @@ -11,6 +15,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,306.0006,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,170.2105,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,476.2111,0,47.35,5934.16 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,309.2402,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,184.9719,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,494.2121,0,91.25,5732.87 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,325.0568,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,231.4032,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,556.46,0,162.09,5117.95 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.2525,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,72.0121,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.2646,0,7.04,14076.53 
+256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.2525,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,72.0121,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.2646,0,7.04,14076.53 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.2525,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,72.0121,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.2646,0,7.04,14076.53 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.2525,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,72.0121,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.2646,0,7.04,14076.53 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.2525,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,72.0121,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.2646,0,7.04,14076.53 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,195.9999,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,102.7882,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,298.7881,0,9.43,9436.42 
256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,277.4499,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,139.0861,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,416.536,0,13.53,6771.1 @@ -18,6 +26,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,306.2672,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,164.6962,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,470.9634,0,47.88,6000.28 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,309.6434,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,178.4363,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,488.0797,0,92.4,5804.9 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,325.7872,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,223.4421,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,549.2293,0,164.22,5185.32 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.4265,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,41.189,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,108.6155,0,12.98,12978.17 
+256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.4265,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,41.189,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,108.6155,0,12.98,12978.17 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.4265,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,41.189,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,108.6155,0,12.98,12978.17 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.4265,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,41.189,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,108.6155,0,12.98,12978.17 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.4265,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,41.189,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,108.6155,0,12.98,12978.17 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,102.7345,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,56.8998,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,159.6343,0,17.66,8832.53 
256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,140.8235,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,76.5494,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,217.3729,0,25.93,6489.6 @@ -25,6 +37,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,158.9481,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,92.9698,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,251.91790000000003,0,89.51,5616.08 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,161.9427,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,114.4508,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,276.3935,0,163.16,5138.67 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,64,0,168.3246,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,205.6813,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,374.0059,0,241.16,3826.96 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,68.0621,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,40.8199,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.882,0,12.94,12946.4 
+256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,68.0621,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,40.8199,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.882,0,12.94,12946.4 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,68.0621,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,40.8199,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.882,0,12.94,12946.4 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,68.0621,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,40.8199,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.882,0,12.94,12946.4 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,68.0621,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,40.8199,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.882,0,12.94,12946.4 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,102.8318,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,57.3307,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,160.1625,0,17.6,8803.4 
256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,141.6806,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,77.5578,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,219.2384,0,25.71,6434.38 @@ -32,6 +48,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,159.3862,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,95.0034,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,254.3896,0,88.64,5561.51 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,162.5288,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.9963,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,276.5251,0,163.09,5136.23 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,64,0,168.5532,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,205.3887,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,373.9419,0,241.2,3827.62 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.6613,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,46.3816,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.7%,115.0429,0,12.25,12253.08 
+256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.6613,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,46.3816,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.7%,115.0429,0,12.25,12253.08 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.6613,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,46.3816,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.7%,115.0429,0,12.25,12253.08 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.6613,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,46.3816,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.7%,115.0429,0,12.25,12253.08 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.6613,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,46.3816,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.7%,115.0429,0,12.25,12253.08 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,158.0965,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,158.0965,1,17.83,8918.44 256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,215.8536,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,215.8536,1,26.12,6535.27 @@ -39,6 +59,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,254.5557,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,254.5557,1,88.58,5557.88 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,267.5654,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x256E,0.0%,0.0,Null,0,267.5654,1,168.55,5308.22 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,366.6991,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,366.6991,1,245.96,3903.22 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.3263,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,44.1851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,112.5114,0,12.53,12528.78 +256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.3263,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,44.1851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,112.5114,0,12.53,12528.78 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.3263,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,44.1851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,112.5114,0,12.53,12528.78 
+256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.3263,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,44.1851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,112.5114,0,12.53,12528.78 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.3263,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,44.1851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,112.5114,0,12.53,12528.78 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,100.365,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf2E,0.0%,61.0618,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,161.4268,0,17.46,8734.45 256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,140.407,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,78.9057,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,219.3127,0,25.7,6432.2 @@ -46,6 +70,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,159.338,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,102.7582,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,262.0962,0,86.03,5397.98 
256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,161.3644,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,132.3204,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,293.6848,0,153.56,4836.12 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,64,0,163.9563,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf3E,0.0%,218.341,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,382.2973,0,235.93,3743.96 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.7481,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,135.0723,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,403.82040000000006,0,6.98,13960.67 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.7481,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,135.0723,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,403.82040000000006,0,6.98,13960.67 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.7481,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,135.0723,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,403.82040000000006,0,6.98,13960.67 
+256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.7481,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,135.0723,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,403.82040000000006,0,6.98,13960.67 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.7481,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,135.0723,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,403.82040000000006,0,6.98,13960.67 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,378.5195,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,196.1646,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,574.6841,0,9.81,9810.72 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,559.7713,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,271.7302,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,831.5015000000001,0,13.56,6781.68 @@ -53,6 +81,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,612.6749,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,322.9055,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,935.5804,0,48.2,6033.14 
256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,623.7185,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,338.7751,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,962.4936,0,93.71,5872.06 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,649.3028,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,368.4383,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,1017.7411,0,177.24,5567.73 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,265.8935,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,135.088,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,400.9815,0,7.03,14059.51 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,265.8935,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,135.088,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,400.9815,0,7.03,14059.51 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,265.8935,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,135.088,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,400.9815,0,7.03,14059.51 
+256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,265.8935,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,135.088,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,400.9815,0,7.03,14059.51 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,265.8935,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,135.088,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,400.9815,0,7.03,14059.51 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,376.5017,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,196.4837,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,572.9854,0,9.84,9839.8 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,556.9744,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,271.6147,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,828.5890999999999,0,13.61,6805.52 @@ -60,6 +92,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,614.0275,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,317.5052,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,931.5327,0,48.41,6059.35 
256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,624.6592,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,332.6196,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,957.2788,0,94.22,5904.05 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,644.3248,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,363.2348,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1007.5596,0,179.04,5623.99 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,139.2785,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,70.4958,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,209.7743,0,13.44,13437.85 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,139.2785,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,70.4958,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,209.7743,0,13.44,13437.85 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,139.2785,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,70.4958,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,209.7743,0,13.44,13437.85 
+256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,139.2785,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,70.4958,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,209.7743,0,13.44,13437.85 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,139.2785,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,70.4958,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,209.7743,0,13.44,13437.85 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,194.034,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,100.7957,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,294.8297,0,19.12,9562.34 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,274.3536,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,140.3968,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,414.7504,0,27.18,6799.15 @@ -67,6 +103,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,64,0,314.2275,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,169.8802,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,484.1077,0,93.16,5833.57 
256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,318.189,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,183.5436,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,501.7326000000001,0,179.77,5639.62 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,64,0,328.7642,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_64x128_2tg_pf2E,0.0%,226.0569,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,554.8211,0,325.13,5119.83 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.6795,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,70.6801,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.3596,0,13.46,13464.47 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.6795,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,70.6801,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.3596,0,13.46,13464.47 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.6795,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,70.6801,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.3596,0,13.46,13464.47 
+256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.6795,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,70.6801,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.3596,0,13.46,13464.47 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.6795,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,70.6801,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.3596,0,13.46,13464.47 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,193.8469,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,101.2026,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,295.0495,0,19.11,9555.21 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,277.7873,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,141.146,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,418.9333,0,26.91,6731.26 @@ -74,6 +114,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,314.5026,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,168.2646,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,482.7672,0,93.41,5849.77 
256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,318.2151,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,183.4334,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,501.6485,0,179.8,5640.57 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,64,0,328.5261,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf2E,0.0%,225.2001,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,553.7262,0,325.77,5129.96 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,203.1825,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,203.1825,1,13.87,13873.81 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,203.1825,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,203.1825,1,13.87,13873.81 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,203.1825,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,203.1825,1,13.87,13873.81 +256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,203.1825,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,203.1825,1,13.87,13873.81 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,203.1825,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x256E,0.0%,0.0,Null,0,203.1825,1,13.87,13873.81 
256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,196.4497,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2E,0.0%,105.2123,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,301.66200000000003,0,18.69,9345.76 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,264.1173,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,144.1125,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,408.2298,0,27.62,6907.75 @@ -81,6 +125,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,482.7665,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512E,0.0%,0.0,Null,0,482.7665,1,93.41,5849.78 256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,494.6598,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x512E,0.0%,0.0,Null,0,494.6598,1,182.34,5720.26 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,333.8711,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,248.6884,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,582.5595000000001,0,309.65,4876.06 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,129.8926,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,73.8599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,203.7525,0,13.83,13835.0 
+256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,129.8926,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,73.8599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,203.7525,0,13.83,13835.0 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,129.8926,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,73.8599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,203.7525,0,13.83,13835.0 +256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,129.8926,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,73.8599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,203.7525,0,13.83,13835.0 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,129.8926,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,73.8599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,203.7525,0,13.83,13835.0 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,196.3192,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf2E,0.0%,102.4978,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,298.817,0,18.86,9434.74 
256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,264.1664,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,141.5633,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,405.7297,0,27.79,6950.31 @@ -88,6 +136,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,305.4521,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,172.4236,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,477.8757,0,94.37,5909.65 256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,306.7972,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,190.6723,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,497.4695,0,181.31,5687.95 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,333.2413,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf2E,0.0%,244.2778,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,577.5191,0,312.35,4918.61 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.5023,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,51.4998,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,141.0021,0,8.57,8568.82 
+256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.5023,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,51.4998,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,141.0021,0,8.57,8568.82 +256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.5023,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,51.4998,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,141.0021,0,8.57,8568.82 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.5023,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,51.4998,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,141.0021,0,8.57,8568.82 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.5023,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,51.4998,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,141.0021,0,8.57,8568.82 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,127.8742,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,68.7529,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,196.6271,0,12.29,6146.07 
256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,136.8058,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,75.6377,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,212.4435,0,22.74,5690.96 @@ -95,11 +147,19 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.7161,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,80.4366,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,221.1527,0,87.39,5481.07 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,144.981,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,104.8371,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,249.8181,0,154.73,4868.94 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,128,0,171.882,moe_ck2stages_gemm1_256x128x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,152.5554,moe_ck2stages_gemm2_256x128x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,324.4374,0,238.29,3774.96 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,89.9594,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.0022,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,140.9616,0,8.57,8571.28 
+256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,89.9594,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.0022,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,140.9616,0,8.57,8571.28 +256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,89.9594,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.0022,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,140.9616,0,8.57,8571.28 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,89.9594,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.0022,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,140.9616,0,8.57,8571.28 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,89.9594,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.0022,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,140.9616,0,8.57,8571.28 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,127.4464,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,69.0267,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,196.4731,0,12.3,6150.89 
256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,136.676,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,75.4552,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,212.1312,0,22.78,5699.34 256,128,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,139.2281,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,77.6845,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,216.9126,0,44.55,5578.55 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,141.0976,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,80.1778,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,221.2754,0,87.35,5478.03 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,131.8625,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,70.246,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,202.1085,0,6.97,13948.11 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,131.8625,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,70.246,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,202.1085,0,6.97,13948.11 
+256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,131.8625,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,70.246,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,202.1085,0,6.97,13948.11 +256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,131.8625,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,70.246,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,202.1085,0,6.97,13948.11 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,131.8625,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,70.246,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,202.1085,0,6.97,13948.11 256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,198.5347,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,102.7245,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.4%,301.2592,0,9.36,9359.02 256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,277.5506,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,141.6194,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,419.17,0,13.45,6728.55 @@ -107,6 +167,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,307.9132,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,170.5755,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,478.4887,0,47.12,5905.91 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,310.8521,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,187.6128,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,498.4649,0,90.47,5683.96 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,325.8822,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,228.9235,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,554.8057,0,162.57,5133.21 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.4088,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,71.9127,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.3215,0,7.04,14072.53 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.4088,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,71.9127,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.3215,0,7.04,14072.53 
+256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.4088,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,71.9127,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.3215,0,7.04,14072.53 +256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.4088,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,71.9127,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.3215,0,7.04,14072.53 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,128.4088,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,71.9127,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,200.3215,0,7.04,14072.53 256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,198.743,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,102.3427,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,301.0857,0,9.36,9364.41 256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,278.5912,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,137.9968,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,416.588,0,13.53,6770.26 @@ -114,6 +178,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,307.5774,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,164.881,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,472.4584,0,47.73,5981.29 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,311.7026,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,177.8505,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,489.5531,0,92.12,5787.43 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,326.6863,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,223.1283,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,549.8146,0,164.04,5179.81 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.999,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,40.5166,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,108.5156,0,12.99,12990.12 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.999,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,40.5166,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,108.5156,0,12.99,12990.12 
+256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.999,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,40.5166,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,108.5156,0,12.99,12990.12 +256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.999,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,40.5166,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,108.5156,0,12.99,12990.12 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,67.999,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,40.5166,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,108.5156,0,12.99,12990.12 256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,102.4854,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,56.7236,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.4%,159.209,0,17.7,8856.12 256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,140.7195,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,75.8797,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,216.5992,0,26.03,6512.78 @@ -121,6 +189,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,158.7735,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,92.6941,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,251.4676,0,89.67,5626.14 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,162.9055,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,114.6803,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,277.5858,0,162.46,5116.6 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,64,0,171.438,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,205.2641,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,376.7021,0,239.43,3799.57 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,67.7683,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,40.8282,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.5965,0,12.98,12980.44 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,67.7683,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,40.8282,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.5965,0,12.98,12980.44 
+256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,67.7683,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,40.8282,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.5965,0,12.98,12980.44 +256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,67.7683,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,40.8282,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.5965,0,12.98,12980.44 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,67.7683,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,40.8282,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,108.5965,0,12.98,12980.44 256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,104.0822,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,57.7224,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,161.8046,0,17.42,8714.06 256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,142.2581,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,77.8633,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,220.1214,0,25.61,6408.57 @@ -128,6 +200,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,158.256,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,94.9442,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,253.2002,0,89.05,5587.64 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,162.1092,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,114.1086,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,276.2178,0,163.27,5141.94 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,64,0,169.5988,moe_ck2stages_gemm1_256x64x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,205.5691,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,375.1679,0,240.41,3815.11 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.2374,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,46.5664,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,114.8038,0,12.28,12278.6 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.2374,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,46.5664,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,114.8038,0,12.28,12278.6 
+256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.2374,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,46.5664,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,114.8038,0,12.28,12278.6 +256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.2374,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,46.5664,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,114.8038,0,12.28,12278.6 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,68.2374,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,46.5664,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.4%,114.8038,0,12.28,12278.6 256,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,100.638,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,64.7122,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,165.3502,0,17.05,8527.2 256,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,139.9452,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,84.8694,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.3%,224.8146,0,25.07,6274.78 @@ -135,7 +211,15 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,267.3292,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0,267.3292,1,84.35,5292.32 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,272.5758,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0,272.5758,1,165.45,5210.65 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,367.9317,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0,367.9317,1,245.14,3890.14 +256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.7365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,44.3023,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.0388,0,12.47,12470.32 +256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.7365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,44.3023,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.0388,0,12.47,12470.32 +256,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.7365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,44.3023,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.0388,0,12.47,12470.32 
+256,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.7365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,44.3023,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.0388,0,12.47,12470.32 256,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,68.7365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,44.3023,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.0388,0,12.47,12470.32 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,268.2034,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,134.7329,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,402.9363,0,7.0,13991.3 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,268.2034,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,134.7329,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,402.9363,0,7.0,13991.3 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,268.2034,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,134.7329,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,402.9363,0,7.0,13991.3 
+256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,268.2034,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,134.7329,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,402.9363,0,7.0,13991.3 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,268.2034,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,134.7329,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,402.9363,0,7.0,13991.3 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,380.515,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,194.6522,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,575.1672,0,9.8,9802.47 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,563.744,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,268.523,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,832.267,0,13.55,6775.45 @@ -143,6 +227,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,613.3676,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,316.2011,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,929.5687,0,48.51,6072.15 
256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,621.3253,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,334.5202,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,955.8455,0,94.36,5912.91 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,647.0892,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,363.288,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1010.3772,0,178.54,5608.31 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,138.6611,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,69.9939,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,208.655,0,13.51,13509.94 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,138.6611,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,69.9939,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,208.655,0,13.51,13509.94 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,138.6611,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,69.9939,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,208.655,0,13.51,13509.94 
+256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,138.6611,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,69.9939,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,208.655,0,13.51,13509.94 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,138.6611,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,69.9939,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,208.655,0,13.51,13509.94 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,194.5186,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,101.0683,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,295.5869,0,19.07,9537.84 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,278.8859,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,141.0514,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,0.0,419,26.85,6715.17 @@ -150,6 +238,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,313.5176,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,168.5062,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,482.0238000000001,0,93.56,5858.79 
256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,32,0,316.7569,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,183.8783,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,500.6352,0,180.16,5651.98 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,64,0,328.9474,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_64x256_pf2E,0.0%,225.6663,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,554.6137,0,325.25,5121.75 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.8701,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,70.2409,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.111,0,13.48,13480.48 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.8701,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,70.2409,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.111,0,13.48,13480.48 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.8701,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,70.2409,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.111,0,13.48,13480.48 
+256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.8701,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,70.2409,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.111,0,13.48,13480.48 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,138.8701,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,70.2409,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,209.111,0,13.48,13480.48 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,195.414,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,101.3303,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,296.74429999999995,0,19.0,9500.64 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,275.5829,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,141.8053,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,417.3882,0,27.01,6756.18 @@ -157,6 +249,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,314.331,moe_ck2stages_gemm1_256x32x64x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,169.9851,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,484.3161,0,93.12,5831.06 
256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,32,0,315.989,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,183.8335,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,499.8225,0,180.45,5661.17 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,64,0,328.6203,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x256_pf2E,0.0%,227.39,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,556.0102999999999,0,324.43,5108.88 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,130.3234,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,74.9639,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,205.2873,0,13.73,13731.57 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,130.3234,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,74.9639,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,205.2873,0,13.73,13731.57 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,130.3234,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,74.9639,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,205.2873,0,13.73,13731.57 
+256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,130.3234,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,74.9639,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,205.2873,0,13.73,13731.57 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,130.3234,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,74.9639,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,205.2873,0,13.73,13731.57 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,298.4201,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,0.0%,0.0,Null,0,298.4201,1,18.89,9447.29 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,266.0248,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,146.7599,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,412.7847,0,27.31,6831.52 @@ -164,6 +260,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,304.5903,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf2E,0.0%,178.5369,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,483.1272,0,93.34,5845.41 256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,498.7106,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_ps_32x512E,0.0%,0.0,Null,0,498.7106,1,180.86,5673.8 
256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,333.1281,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf2E,0.0%,249.1231,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,582.2512,0,309.81,4878.64 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,130.2529,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,74.4948,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,204.7477,0,13.77,13767.76 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,130.2529,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,74.4948,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,204.7477,0,13.77,13767.76 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,130.2529,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,74.4948,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,204.7477,0,13.77,13767.76 +256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,130.2529,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,74.4948,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,204.7477,0,13.77,13767.76 
256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,130.2529,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,74.4948,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,204.7477,0,13.77,13767.76 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,195.8963,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,103.7824,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,299.6787,0,18.81,9407.61 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,265.4483,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,142.3836,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,407.8319,0,27.64,6914.49 @@ -171,6 +271,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,304.4449,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,177.3471,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,481.792,0,93.6,5861.61 256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,306.5738,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf2E,0.0%,190.3011,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,496.8749,0,181.52,5694.76 
256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,332.9031,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf2E,0.0%,245.6419,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,578.545,0,311.8,4909.89 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.9773,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,50.9117,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,140.889,0,8.57,8575.7 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.9773,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,50.9117,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,140.889,0,8.57,8575.7 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.9773,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,50.9117,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,140.889,0,8.57,8575.7 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.9773,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,50.9117,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,140.889,0,8.57,8575.7 
256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,89.9773,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,50.9117,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,140.889,0,8.57,8575.7 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,127.6678,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,69.4848,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,197.1526,0,12.25,6129.69 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,136.4991,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,75.3558,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,211.8549,0,22.81,5706.77 @@ -178,6 +282,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,141.1941,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,80.8519,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,222.046,0,87.04,5459.02 256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,144.5293,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,104.6954,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,249.2247,0,155.1,4880.53 
256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,128,0,169.6864,moe_ck2stages_gemm1_256x128x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,153.1648,moe_ck2stages_gemm2_256x128x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,322.8512,0,239.46,3793.5 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,90.339,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,50.8203,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,141.1593,0,8.56,8559.28 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,90.339,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,50.8203,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,141.1593,0,8.56,8559.28 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,90.339,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,50.8203,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,141.1593,0,8.56,8559.28 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,90.339,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,50.8203,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,141.1593,0,8.56,8559.28 
256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,90.339,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,50.8203,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,141.1593,0,8.56,8559.28 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,127.4505,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,69.9278,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,197.3783,0,12.24,6122.68 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,136.5934,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,76.0763,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,212.6697,0,22.72,5684.91 @@ -185,6 +293,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,140.488,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,80.8556,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,221.3436,0,87.32,5476.34 256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,64,0,144.855,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,97.8569,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,242.7119,0,159.26,5011.49 
256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,128,0,171.8959,moe_ck2stages_gemm1_256x128x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,144.4775,moe_ck2stages_gemm2_256x128x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,316.3734,0,244.36,3871.17 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.9507,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,42.2681,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,93.2188,0,12.96,6481.27 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.9507,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,42.2681,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,93.2188,0,12.96,6481.27 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.9507,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,42.2681,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,93.2188,0,12.96,6481.27 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.9507,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,42.2681,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,93.2188,0,12.96,6481.27 
256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.9507,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,42.2681,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,93.2188,0,12.96,6481.27 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,79.6392,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,54.2772,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,133.9164,0,18.04,4513.06 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,83.5661,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,60.2323,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,143.79840000000002,0,33.6,4205.65 @@ -192,6 +304,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,85.0781,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,66.2997,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,151.3778,0,127.68,4010.66 256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,87.3865,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,82.0796,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,169.46609999999998,0,228.1,3601.14 
256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,92.393,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,125.505,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,217.898,0,354.8,2829.59 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.387,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,41.0638,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.4508,0,13.07,6535.11 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.387,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,41.0638,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.4508,0,13.07,6535.11 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.387,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,41.0638,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.4508,0,13.07,6535.11 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.387,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,41.0638,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.4508,0,13.07,6535.11 
256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.387,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,41.0638,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.4508,0,13.07,6535.11 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,81.0339,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,51.9652,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,132.9991,0,18.16,4544.19 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,84.9852,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,57.2253,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,142.2105,0,33.98,4252.61 @@ -199,6 +315,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,86.6075,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.6415,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,151.249,0,127.78,4014.08 256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,91.2218,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,82.8771,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,174.0989,0,222.03,3505.31 
256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,93.6859,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,125.2007,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,218.8866,0,353.19,2816.81 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.0789,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,76.0789,1,15.88,7941.44 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.0789,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,76.0789,1,15.88,7941.44 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.0789,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,76.0789,1,15.88,7941.44 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.0789,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,76.0789,1,15.88,7941.44 256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.0789,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,76.0789,1,15.88,7941.44 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,105.3523,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,105.3523,1,22.93,5736.69 
256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,115.4816,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,115.4816,1,41.84,5236.91 @@ -206,6 +326,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,120.3527,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x192E,0.0%,0.0,Null,0,120.3527,1,160.59,5044.55 256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,149.2682,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x384E,0.0%,0.0,Null,0,149.2682,1,258.96,4088.42 256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,91.8397,_ZN5aiter45fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf3E,0.0%,159.1852,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,251.0249,0,307.98,2456.18 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.7186,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.1778,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.8964,0,10.61,5304.61 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.7186,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.1778,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.8964,0,10.61,5304.61 
+256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.7186,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.1778,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.8964,0,10.61,5304.61 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.7186,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.1778,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.8964,0,10.61,5304.61 256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.7186,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.1778,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.8964,0,10.61,5304.61 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,80.6595,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,84.4366,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,165.09609999999998,0,14.63,3660.73 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,83.8169,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,88.246,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,172.0629,0,28.08,3514.8 @@ -219,6 +343,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,159.5203,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,103.4399,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,262.9602,0,85.75,5380.25 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,160.4127,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf2E,0.0%,132.4101,moe_ck2stages_gemm2_256x32x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,292.82280000000003,0,154.01,4850.36 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,64,0,162.8098,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_64x128_2tg_pf2E,0.0%,221.1124,moe_ck2stages_gemm2_256x64x128x256_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,383.9222,0,234.93,3728.12 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,269.3232,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.7722,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,404.0954,0,6.98,13951.17 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,269.3232,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.7722,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,404.0954,0,6.98,13951.17 
+256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,269.3232,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.7722,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,404.0954,0,6.98,13951.17 +256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,269.3232,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.7722,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,404.0954,0,6.98,13951.17 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,269.3232,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.7722,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,404.0954,0,6.98,13951.17 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,381.0416,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,195.7302,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,576.7718,0,9.77,9775.2 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,562.6212,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,271.5572,moe_ck2stages_gemm2_256x32x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,834.1784,0,13.52,6759.92 @@ -227,6 +355,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,622.7788,moe_ck2stages_gemm1_256x64x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,341.9723,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,964.7511,0,93.49,5858.32 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,64,0,649.2553,moe_ck2stages_gemm1_256x64x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.4126,moe_ck2stages_gemm2_256x64x128x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,1020.6679,0,176.74,5551.76 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,128,0,171.7102,moe_ck2stages_gemm1_256x128x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,144.3167,moe_ck2stages_gemm2_256x128x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,316.0269,0,244.63,3875.42 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.0987,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,42.6821,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,92.7808,0,13.02,6511.87 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.0987,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,42.6821,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,92.7808,0,13.02,6511.87 
+256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.0987,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,42.6821,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,92.7808,0,13.02,6511.87 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.0987,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,42.6821,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,92.7808,0,13.02,6511.87 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,50.0987,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,42.6821,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,92.7808,0,13.02,6511.87 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,78.873,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,54.6731,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,133.5461,0,18.09,4525.58 256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,82.7604,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,60.2229,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,142.9833,0,33.79,4229.63 @@ -234,6 +366,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,84.7124,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,66.4881,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.2005,0,127.83,4015.37 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,86.6188,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.3428,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,169.96159999999998,0,227.43,3590.64 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,0,128,0,92.6237,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,125.9531,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,218.5768,0,353.69,2820.81 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.0702,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,41.1541,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.2243,0,13.1,6551.16 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.0702,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,41.1541,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.2243,0,13.1,6551.16 
+256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.0702,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,41.1541,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.2243,0,13.1,6551.16 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.0702,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,41.1541,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.2243,0,13.1,6551.16 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,51.0702,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,41.1541,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,92.2243,0,13.1,6551.16 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,81.6357,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,52.029,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,133.6647,0,18.07,4521.56 256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,84.8817,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,56.8257,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,141.7074,0,34.1,4267.71 @@ -241,6 +377,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,86.7569,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,64.6359,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,151.39280000000002,0,127.66,4010.27 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,88.2869,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,82.4745,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,170.7614,0,226.37,3573.82 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Tensor,1,1,128,0,93.1182,moe_ck2stages_gemm1_256x128x64x128_1x4_TypeCast_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,125.1912,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,218.3094,0,354.13,2824.26 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.4501,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,76.4501,1,15.8,7902.89 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.4501,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,76.4501,1,15.8,7902.89 +256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.4501,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,76.4501,1,15.8,7902.89 
+256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.4501,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,76.4501,1,15.8,7902.89 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,76.4501,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,76.4501,1,15.8,7902.89 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,104.0048,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,104.0048,1,23.23,5811.01 256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,117.2665,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x192E,0.0%,0.0,Null,0,117.2665,1,41.2,5157.2 @@ -248,6 +388,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,122.2554,_ZN5aiter48fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_ps_32x192E,0.0%,0.0,Null,0,122.2554,1,158.09,4966.04 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,32,0,148.475,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_gelu_1tg_32x384E,0.0%,0.0,Null,0,148.475,1,260.34,4110.26 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,0,128,0,92.8309,_ZN5aiter45fmoe_stage1_bf16_pertokenFp8_g1u1_128x128_pf3E,0.0%,159.4309,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,252.2618,0,306.46,2444.14 
+256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.6325,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.2871,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.9196,0,10.6,5303.53 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.6325,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.2871,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.9196,0,10.6,5303.53 +256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.6325,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.2871,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.9196,0,10.6,5303.53 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.6325,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.2871,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.9196,0,10.6,5303.53 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,49.6325,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,64.2871,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,113.9196,0,10.6,5303.53 
256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,80.9233,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,84.6624,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,165.5857,0,14.59,3649.91 256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,84.5381,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,88.2359,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,172.774,0,27.97,3500.33 @@ -255,6 +399,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,86.7616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,94.7695,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,181.5311,0,106.47,3344.47 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,89.8093,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x64_pf3E,0.0%,111.8215,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,201.6308,0,191.71,3026.68 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,128,0,94.4833,_ZN5aiter54fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_128x128_pf3E,0.0%,162.4132,moe_ck2stages_gemm2_256x128x128x128_1x4_TypeCast_v3_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,256.8965,0,300.94,2400.04 
+80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.4483,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,155.017,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,393.4653,0,3.58,7164.62 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.4483,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,155.017,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,393.4653,0,3.58,7164.62 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.4483,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,155.017,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,393.4653,0,3.58,7164.62 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.4483,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,155.017,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,393.4653,0,3.58,7164.62 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.4483,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,155.017,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,393.4653,0,3.58,7164.62 
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,367.326,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,243.7681,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,611.0941,0,4.61,4613.84 80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,485.1197,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,319.3392,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,804.4589000000001,0,7.01,3505.97 @@ -262,6 +410,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,588.4224,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,400.0835,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,988.5059,0,22.81,2858.77 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,617.8863,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,423.4145,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1041.3008,0,43.31,2720.88 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,783.2767,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,600.1759,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.3%,1383.4526,0,65.2,2058.57 
+80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,238.507,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,146.2522,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,384.7592,0,3.66,7326.74 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,238.507,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,146.2522,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,384.7592,0,3.66,7326.74 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,238.507,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,146.2522,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,384.7592,0,3.66,7326.74 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,238.507,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,146.2522,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,384.7592,0,3.66,7326.74 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,238.507,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,146.2522,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,384.7592,0,3.66,7326.74 
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,376.8336,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,233.1548,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,609.9884,0,4.62,4622.2 80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,484.4582,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,310.4899,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,794.9481,0,7.09,3547.91 @@ -269,6 +421,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,591.6172,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,377.301,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,968.9182,0,23.27,2916.56 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,621.3374,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,402.5935,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1023.9309,0,44.04,2767.03 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,784.9765,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,567.3861,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1352.3626,0,66.69,2105.89 
+80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,121.6233,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,117.3629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,238.9862,0,5.9,5898.37 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,121.6233,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,117.3629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,238.9862,0,5.9,5898.37 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,121.6233,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,117.3629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,238.9862,0,5.9,5898.37 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,121.6233,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,117.3629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,238.9862,0,5.9,5898.37 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,121.6233,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,117.3629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,238.9862,0,5.9,5898.37 
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,194.485,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,176.724,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,371.209,0,7.59,3798.33 80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,251.2934,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,233.6045,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,484.8979,0,11.63,2909.19 @@ -276,6 +432,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,300.6316,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,279.9782,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,580.6098,0,38.84,2436.73 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,312.6244,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,296.8907,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,609.5151,0,73.99,2330.21 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,394.9761,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,427.6178,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.3%,822.5939,0,109.65,1739.99 
+80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,120.6124,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,113.2457,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,233.8581,0,6.03,6027.72 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,120.6124,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,113.2457,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,233.8581,0,6.03,6027.72 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,120.6124,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,113.2457,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,233.8581,0,6.03,6027.72 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,120.6124,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,113.2457,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,233.8581,0,6.03,6027.72 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,120.6124,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,113.2457,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,233.8581,0,6.03,6027.72 
80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,188.7522,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,170.2119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,358.96410000000003,0,7.85,3927.9 80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,254.6539,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.1%,220.0918,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,474.7457,0,11.87,2971.41 @@ -283,6 +443,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,299.2146,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,268.1991,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,567.4137000000001,0,39.74,2493.4 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,316.0831,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,287.2443,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,603.3274,0,74.75,2354.11 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,398.3967,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,410.8972,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,809.2939,0,111.45,1768.59 
+80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,239.0773,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,239.0773,1,5.89,5896.13 80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,361.5408,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,361.5408,1,7.8,3899.9 80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,409.1668,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,409.1668,1,13.78,3447.65 @@ -290,6 +454,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,534.8764,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,534.8764,1,42.16,2645.08 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.9956,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.9956,1,80.39,2531.74 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,723.0851,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,723.0851,1,124.74,1979.44 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15 +80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15 
+80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,438.8756,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,256.4253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,695.3009,0,4.05,8108.15 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,714.79,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,400.2169,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,1115.0069,0,5.06,5056.53 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,998.4164,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,561.412,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.9%,1559.8284,0,7.23,3615.13 @@ -300,6 +468,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,504.1748,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,378.662,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,882.8368,0,12.77,3194.19 
80,128,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,570.6719,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.1%,417.051,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,987.7229,0,22.83,2856.39 80,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,597.6775,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,432.5419,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1030.2194,0,43.77,2741.24 +80,1,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.0761,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,66.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,154.5774,0,3.91,3908.99 +80,2,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.0761,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,66.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,154.5774,0,3.91,3908.99 +80,4,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.0761,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,66.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,154.5774,0,3.91,3908.99 
+80,8,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.0761,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,66.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,154.5774,0,3.91,3908.99 80,16,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.0761,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,66.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,154.5774,0,3.91,3908.99 80,32,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,116.9005,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,89.872,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,206.7725,0,5.84,2923.52 80,64,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,130.2259,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,101.7908,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,232.0167,0,10.41,2607.69 @@ -307,6 +479,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,139.8396,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,107.8712,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,247.7108,0,39.01,2455.18 
80,512,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,176.0293,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,152.7806,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,328.8099,0,58.78,1862.38 80,1024,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,283.4477,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,254.2291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,537.6768,0,71.89,1154.52 +80,1,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.5875,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,61.2038,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,149.7913,0,4.03,4033.89 +80,2,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.5875,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,61.2038,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,149.7913,0,4.03,4033.89 +80,4,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.5875,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,61.2038,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,149.7913,0,4.03,4033.89 
+80,8,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.5875,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,61.2038,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,149.7913,0,4.03,4033.89 80,16,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.5875,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,61.2038,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,149.7913,0,4.03,4033.89 80,32,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,116.7801,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,82.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,199.298,0,6.06,3033.17 80,64,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,130.0463,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,93.2196,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,223.2659,0,10.82,2709.9 @@ -314,6 +490,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,136.6552,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,99.1414,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,235.7966,0,40.98,2579.23 
80,512,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,177.8056,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,140.1446,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,317.9502,0,60.79,1925.99 80,1024,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,284.4745,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,232.9485,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,517.423,0,74.71,1199.71 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,160.613,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,97.024,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,257.637,0,4.69,4689.63 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,160.613,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,97.024,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,257.637,0,4.69,4689.63 +80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,160.613,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,97.024,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,257.637,0,4.69,4689.63 
+80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,160.613,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,97.024,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,257.637,0,4.69,4689.63 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,160.613,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,97.024,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,257.637,0,4.69,4689.63 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,214.8572,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,129.0874,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,343.94460000000004,0,7.02,3513.6 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,238.0028,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,151.5356,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.3%,389.5384,0,12.4,3103.69 @@ -321,6 +501,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,268.9473,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,167.0488,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,435.9961,0,44.33,2780.19 
80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,347.7216,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,219.8661,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,567.5877,0,68.1,2143.01 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.4769,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,371.6201,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.4%,895.097,0,86.37,1368.27 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,165.7381,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,94.3172,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,260.0553,0,4.65,4646.02 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,165.7381,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,94.3172,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,260.0553,0,4.65,4646.02 +80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,165.7381,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,94.3172,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,260.0553,0,4.65,4646.02 
+80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,165.7381,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,94.3172,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,260.0553,0,4.65,4646.02 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,165.7381,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,94.3172,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,260.0553,0,4.65,4646.02 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,216.9447,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,126.1828,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,343.1275,0,7.04,3521.97 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,246.22,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,145.7239,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,391.9439,0,12.33,3084.65 @@ -328,6 +512,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,271.8789,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,161.456,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,433.3349,0,44.6,2797.27 
80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,347.2111,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,210.4569,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,557.668,0,69.31,2181.13 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.4171,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,354.8123,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.2293999999999,0,87.93,1392.97 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,82.6308,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,68.157,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,150.7878,0,8.01,4006.8 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,82.6308,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,68.157,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,150.7878,0,8.01,4006.8 +80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,82.6308,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,68.157,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,150.7878,0,8.01,4006.8 
+80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,82.6308,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,68.157,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,150.7878,0,8.01,4006.8 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,82.6308,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,68.157,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,150.7878,0,8.01,4006.8 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,109.4253,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,88.2541,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,197.6794,0,12.22,3057.34 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,127.2465,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,102.2132,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,229.4597,0,21.06,2635.61 @@ -335,6 +523,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,134.0808,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,109.8052,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,243.886,0,79.25,2489.38 80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,179.6681,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,155.5374,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,335.20550000000003,0,115.32,1820.59 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,269.7206,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,263.4387,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.4%,533.1593,0,145.0,1156.43 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,83.7141,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.4684,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,148.1825,0,8.15,4077.25 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,83.7141,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.4684,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,148.1825,0,8.15,4077.25 
+80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,83.7141,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.4684,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,148.1825,0,8.15,4077.25 +80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,83.7141,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.4684,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,148.1825,0,8.15,4077.25 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,83.7141,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,64.4684,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,148.1825,0,8.15,4077.25 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,109.0496,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,83.8951,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,192.9447,0,12.52,3132.36 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,127.6734,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,97.1704,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,224.8438,0,21.49,2689.72 @@ -342,6 +534,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,134.708,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,105.5023,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,240.2103,0,80.46,2527.47 80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,178.7076,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,147.3146,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,326.0222,0,118.56,1871.87 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,269.4957,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,248.2335,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,517.7292,0,149.32,1190.9 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.7173,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.3%,76.402,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,159.1193,0,7.59,3797.0 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.7173,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.3%,76.402,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,159.1193,0,7.59,3797.0 
+80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.7173,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.3%,76.402,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,159.1193,0,7.59,3797.0 +80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.7173,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.3%,76.402,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,159.1193,0,7.59,3797.0 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.7173,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.3%,76.402,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,159.1193,0,7.59,3797.0 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,109.8209,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,99.0655,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,208.8864,0,11.57,2893.31 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,119.69,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,114.5365,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,234.2265,0,20.63,2581.97 @@ -349,6 +545,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,129.4978,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,126.7846,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,256.2824,0,75.41,2368.97 80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,174.904,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,184.6529,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,359.5569,0,107.51,1697.29 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.1056,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_silu_F8_F8_B16,0.0%,314.319,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.4%,586.4246,0,131.83,1051.39 +80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,82.1262,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,74.17,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,156.2962,0,7.73,3865.59 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,82.1262,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,74.17,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,156.2962,0,7.73,3865.59 
+80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,82.1262,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,74.17,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,156.2962,0,7.73,3865.59 +80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,82.1262,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,74.17,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,156.2962,0,7.73,3865.59 80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,82.1262,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,74.17,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,156.2962,0,7.73,3865.59 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,110.0096,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,95.6992,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,205.7088,0,11.74,2938.0 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,125.1757,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,111.2889,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,236.4646,0,20.43,2557.53 @@ -356,11 +556,19 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,136.164,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,123.0773,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,259.2413,0,74.55,2341.93 80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,173.4458,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,178.424,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,351.8698,0,109.86,1734.37 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,268.8237,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,301.9153,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,570.739,0,135.45,1080.29 +80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,165.0797,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,96.9624,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,262.0421,0,4.61,4610.79 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,165.0797,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,96.9624,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,262.0421,0,4.61,4610.79 
+80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,165.0797,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,96.9624,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,262.0421,0,4.61,4610.79 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,165.0797,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,96.9624,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,262.0421,0,4.61,4610.79 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,165.0797,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,96.9624,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,262.0421,0,4.61,4610.79 80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,217.0102,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,129.4305,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,346.4407,0,6.97,3488.28 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.1054,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,150.0225,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,391.1279,0,12.35,3091.08 
80,128,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,256.7095,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,157.2573,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,413.9668,0,23.34,2923.08 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,265.3977,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,166.4318,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,431.8295,0,44.76,2807.02 +80,1,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,38.52,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,36.2602,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.78020000000001,0,4.04,4040.12 +80,2,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,38.52,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,36.2602,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.78020000000001,0,4.04,4040.12 +80,4,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,38.52,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,36.2602,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.78020000000001,0,4.04,4040.12 
+80,8,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,38.52,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,36.2602,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.78020000000001,0,4.04,4040.12 80,16,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,38.52,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,36.2602,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.78020000000001,0,4.04,4040.12 80,32,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,58.4576,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,49.4407,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,107.8983,0,5.6,2801.27 80,64,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,62.2694,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,55.5013,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,117.7707,0,10.26,2568.67 @@ -368,6 +576,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,66.8949,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,59.7486,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,126.64350000000002,0,38.15,2401.13 
80,512,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,94.7614,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,83.103,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,177.8644,0,54.33,1721.45 80,1024,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,143.5168,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,134.0455,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,277.5623,0,69.63,1118.23 +80,1,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.6753,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,33.9165,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.5918,0,4.16,4161.92 +80,2,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.6753,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,33.9165,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.5918,0,4.16,4161.92 +80,4,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.6753,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,33.9165,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.5918,0,4.16,4161.92 
+80,8,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.6753,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,33.9165,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.5918,0,4.16,4161.92 80,16,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.6753,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,33.9165,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.5918,0,4.16,4161.92 80,32,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,59.5728,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,45.8094,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,105.3822,0,5.73,2868.15 80,64,2048,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,62.2801,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,51.218,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,113.4981,0,10.64,2665.37 @@ -381,6 +593,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,292.8938,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,319.6213,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,612.5151000000001,0,36.81,2309.81 
80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,305.4473,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,349.9039,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,655.3512000000001,0,68.81,2167.23 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,394.0843,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,517.6212,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,911.7055,0,98.93,1569.92 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.3413,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,156.5265,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,397.8678,0,3.54,7085.35 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.3413,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,156.5265,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,397.8678,0,3.54,7085.35 +80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.3413,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,156.5265,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,397.8678,0,3.54,7085.35 
+80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.3413,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,156.5265,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,397.8678,0,3.54,7085.35 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,241.3413,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,156.5265,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.7%,397.8678,0,3.54,7085.35 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,368.8396,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,244.9596,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,613.7992,0,4.59,4593.51 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,486.2218,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,330.374,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,816.5958,0,6.9,3453.86 @@ -388,6 +604,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,587.3979,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,399.9114,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,987.3093,0,22.84,2862.24 
80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,619.5779,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,428.3394,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1047.9173,0,43.04,2703.7 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,785.2244,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,600.8826,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,1.6%,1386.107,0,65.07,2054.63 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,239.8912,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,147.6741,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,387.5653,0,3.64,7273.69 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,239.8912,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,147.6741,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,387.5653,0,3.64,7273.69 +80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,239.8912,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,147.6741,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,387.5653,0,3.64,7273.69 
+80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,239.8912,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,147.6741,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,387.5653,0,3.64,7273.69 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,239.8912,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,147.6741,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,387.5653,0,3.64,7273.69 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,368.433,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,237.6243,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,606.0572999999999,0,4.65,4652.18 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,493.7026,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,313.5552,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,807.2578000000001,0,6.98,3493.81 @@ -395,7 +615,15 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,593.7672,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,382.7183,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,976.4855,0,23.09,2893.96 
80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,617.3315,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,411.0096,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1028.3411,0,43.85,2755.17 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,781.9082,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,569.3894,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1351.2976,0,66.75,2107.55 +80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,118.5389,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,118.5512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.0901,0,5.94,5945.55 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,118.5389,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,118.5512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.0901,0,5.94,5945.55 +80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,118.5389,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,118.5512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.0901,0,5.94,5945.55 
+80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,118.5389,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,118.5512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.0901,0,5.94,5945.55 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,118.5389,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_silu_F8_F8_B16,0.0%,118.5512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.0901,0,5.94,5945.55 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,120.5717,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,117.6181,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,238.1898,0,5.92,5918.1 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,120.5717,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,117.6181,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,238.1898,0,5.92,5918.1 +80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,120.5717,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,117.6181,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,238.1898,0,5.92,5918.1 
+80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,120.5717,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,117.6181,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,238.1898,0,5.92,5918.1 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,120.5717,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,117.6181,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,238.1898,0,5.92,5918.1 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,188.5396,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,176.4776,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,365.0172,0,7.72,3862.76 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,252.7997,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,230.1231,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.7%,482.9228,0,11.67,2921.09 @@ -403,6 +631,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,301.0401,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,279.1768,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,580.2169,0,38.86,2438.38 80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,314.4244,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,297.6385,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,612.0629,0,73.68,2320.51 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,388.4815,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,429.5016,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,1.6%,817.9830999999999,0,110.26,1749.8 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,121.9866,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.0687,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,235.0553,0,6.0,5997.02 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,121.9866,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.0687,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,235.0553,0,6.0,5997.02 
+80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,121.9866,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.0687,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,235.0553,0,6.0,5997.02 +80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,121.9866,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.0687,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,235.0553,0,6.0,5997.02 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,121.9866,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,113.0687,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,235.0553,0,6.0,5997.02 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,191.3402,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.9426,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,361.2828,0,7.8,3902.69 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,253.0207,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,220.8733,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,473.894,0,11.9,2976.75 @@ -411,20 +643,36 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,304.03,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,287.1332,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,591.1632,0,76.29,2402.55 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,390.4133,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,410.897,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,801.3103,0,112.56,1786.21 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1538.2556,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,1005.4051,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2543.6607,0,70.92,2227.7 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,237.9312,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,159.3483,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,397.2795,0,7.09,7095.55 
+80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,237.9312,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,159.3483,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,397.2795,0,7.09,7095.55 +80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,237.9312,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,159.3483,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,397.2795,0,7.09,7095.55 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,237.9312,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,159.3483,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,397.2795,0,7.09,7095.55 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,237.9312,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,159.3483,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,397.2795,0,7.09,7095.55 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,369.2362,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,241.8314,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.9%,611.0676,0,9.23,4613.66 
80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,510.6484,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,335.3164,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,845.9648,0,13.33,3333.41 80,128,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,582.0633,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_silu_F8_F8_B16,0.0%,408.0174,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,990.0807,0,22.77,2849.59 80,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,615.4904,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,418.179,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1033.6694,0,43.63,2732.09 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,762.665,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,620.1682,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1382.8332,0,130.45,2054.18 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,231.5308,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,154.9071,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,386.4379,0,7.29,7294.62 
+80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,231.5308,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,154.9071,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,386.4379,0,7.29,7294.62 +80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,231.5308,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,154.9071,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,386.4379,0,7.29,7294.62 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,231.5308,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,154.9071,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,386.4379,0,7.29,7294.62 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,231.5308,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,154.9071,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,386.4379,0,7.29,7294.62 80,128,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,579.7023,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.1%,364.33,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,944.0323,0,23.89,2988.59 
80,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,601.7144,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,402.2826,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1003.997,0,44.92,2812.83 80,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,625.392,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,411.3256,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1036.7176,0,87.0,2729.37 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,763.616,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,611.4583,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1375.0743,0,131.18,2065.77 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,228.7896,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,171.5575,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,400.3471,0,7.04,7041.18 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,228.7896,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,171.5575,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,400.3471,0,7.04,7041.18 
+80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,228.7896,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,171.5575,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,400.3471,0,7.04,7041.18 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,228.7896,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,171.5575,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,400.3471,0,7.04,7041.18 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,228.7896,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,171.5575,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.9%,400.3471,0,7.04,7041.18 80,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,618.0944,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.1%,476.0445,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1094.1389,0,82.43,2586.13 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.5786,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,718.1941,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,2.8%,1468.7727,0,122.82,1933.99 
+80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,231.8332,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,169.7631,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,401.5963,0,7.02,7019.28 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,231.8332,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,169.7631,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,401.5963,0,7.02,7019.28 +80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,231.8332,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,169.7631,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,401.5963,0,7.02,7019.28 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,231.8332,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,169.7631,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,401.5963,0,7.02,7019.28 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,231.8332,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,169.7631,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,401.5963,0,7.02,7019.28 
80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,367.8369,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,260.1737,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,628.0106000000001,0,8.98,4489.19 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,495.9422,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,361.1859,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,857.1281,0,13.15,3290.0 @@ -434,6 +682,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,752.5409,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.4453,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1447.9861999999998,0,124.58,1961.75 80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,351.9236,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,220.5397,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,572.4633,0,67.52,2124.76 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,523.5384,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,372.5179,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.5%,896.0563,0,86.28,1366.81 
+80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,161.7277,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,96.3101,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,258.0378,0,4.68,4682.34 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,161.7277,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,96.3101,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,258.0378,0,4.68,4682.34 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,161.7277,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,96.3101,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,258.0378,0,4.68,4682.34 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,161.7277,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,96.3101,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,258.0378,0,4.68,4682.34 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,161.7277,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,96.3101,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,258.0378,0,4.68,4682.34 
80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,209.4214,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,126.4843,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,335.9057,0,7.19,3597.69 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,240.9494,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,148.9541,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,389.9035,0,12.39,3100.79 @@ -441,6 +693,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,269.3657,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,164.1089,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,433.4746,0,44.59,2796.37 80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,352.9213,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,213.1932,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,566.1144999999999,0,68.28,2148.59 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,524.8862,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,354.9441,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,879.8303000000001,0,87.87,1392.01 
+80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,83.611,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,67.927,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.538,0,7.97,3986.96 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,83.611,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,67.927,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.538,0,7.97,3986.96 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,83.611,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,67.927,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.538,0,7.97,3986.96 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,83.611,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,67.927,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.538,0,7.97,3986.96 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,83.611,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,67.927,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,151.538,0,7.97,3986.96 
80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,107.6661,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,87.4868,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,195.1529,0,12.38,3096.92 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,125.0556,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,101.2628,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,226.3184,0,21.35,2672.19 @@ -448,6 +704,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,135.4179,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,109.748,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,245.1659,0,78.83,2476.39 80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,177.8313,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,155.4294,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,333.2607,0,115.99,1831.21 
80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,264.4639,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,263.7878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,0.5%,528.2517,0,146.35,1167.18 +80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,84.3415,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,65.6237,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,149.96519999999998,0,8.05,4028.78 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,84.3415,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,65.6237,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,149.96519999999998,0,8.05,4028.78 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,84.3415,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,65.6237,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,149.96519999999998,0,8.05,4028.78 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,84.3415,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,65.6237,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,149.96519999999998,0,8.05,4028.78 
80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,84.3415,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,65.6237,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,149.96519999999998,0,8.05,4028.78 80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,109.5369,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.7014,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,193.2383,0,12.5,3127.6 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,130.7478,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,96.7112,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,227.459,0,21.24,2658.79 @@ -455,6 +715,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,134.1437,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,106.1464,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,240.2901,0,80.43,2526.64 
80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,179.8699,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,146.8478,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,326.71770000000004,0,118.31,1867.89 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,266.6047,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,248.2173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,514.822,0,150.17,1197.62 +80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.4918,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,76.3624,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,158.8542,0,7.6,3803.34 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.4918,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,76.3624,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,158.8542,0,7.6,3803.34 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.4918,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,76.3624,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,158.8542,0,7.6,3803.34 
+80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.4918,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,76.3624,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,158.8542,0,7.6,3803.34 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,82.4918,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,76.3624,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,158.8542,0,7.6,3803.34 80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,108.8976,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,99.2988,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,208.1964,0,11.6,2902.9 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,126.1647,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,114.79,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,240.9547,0,20.05,2509.88 @@ -462,6 +726,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,136.6099,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,127.3005,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,263.9104,0,73.23,2300.5 
80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,174.3438,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,184.6666,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,359.0104,0,107.67,1699.87 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,272.9741,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,313.1772,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,0.5%,586.1513,0,131.89,1051.88 +80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,84.9173,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,74.1307,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,159.048,0,7.59,3798.7 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,84.9173,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,74.1307,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,159.048,0,7.59,3798.7 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,84.9173,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,74.1307,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,159.048,0,7.59,3798.7 
+80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,84.9173,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,74.1307,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,159.048,0,7.59,3798.7 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,84.9173,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,74.1307,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,159.048,0,7.59,3798.7 80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,110.1287,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,96.0451,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,206.1738,0,11.72,2931.38 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,126.6253,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,111.3536,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,237.9789,0,20.3,2541.26 @@ -469,6 +737,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,130.1837,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,123.1176,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,253.3013,0,76.3,2396.85 80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,177.7956,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,178.1446,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,355.9402,0,108.6,1714.53 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,271.6424,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,302.5119,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,574.1543,0,134.65,1073.86 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,120.3829,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.4734,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,254.8563,0,5.53,5531.08 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,120.3829,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.4734,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,254.8563,0,5.53,5531.08 
+80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,120.3829,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.4734,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,254.8563,0,5.53,5531.08 +80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,120.3829,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.4734,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,254.8563,0,5.53,5531.08 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,120.3829,_ZN5aiter47fmoe_stage1_bf16_pertokenFp8_g1u1_32x64_4tg_pf3E,0.0%,134.4734,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,254.8563,0,5.53,5531.08 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,188.6312,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,205.9065,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,394.5377,0,7.14,3573.74 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,249.8568,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf2E,0.0%,269.7859,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,519.6427,0,10.85,2714.68 @@ -476,6 +748,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,302.8986,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,328.0652,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,630.9638,0,35.74,2242.27 80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,305.7627,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.1%,359.7434,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,665.5061000000001,0,67.76,2134.16 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,390.1172,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf3E,0.0%,531.1275,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,1.6%,921.2447,0,97.9,1553.67 +80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,119.4832,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,131.8606,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,251.3438,0,5.61,5608.37 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,119.4832,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,131.8606,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,251.3438,0,5.61,5608.37 
+80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,119.4832,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,131.8606,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,251.3438,0,5.61,5608.37 +80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,119.4832,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,131.8606,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,251.3438,0,5.61,5608.37 80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,119.4832,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,131.8606,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,251.3438,0,5.61,5608.37 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,192.247,_ZN5aiter56fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x64_4tg_pf3E,0.0%,200.7748,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,393.0218,0,7.17,3587.52 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,247.1864,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,262.0145,moe_ck2stages_gemm2_256x32x64x256_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,509.2009,0,11.07,2770.35 @@ -484,6 +760,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,310.6611,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,349.0712,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,659.7322999999999,0,68.36,2152.84 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,385.5365,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf3E,0.0%,515.5173,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,901.0538,0,100.1,1588.48 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4564,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,1031.1042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,2.8%,2576.5606,0,70.01,2199.25 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,442.5097,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,249.4045,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,691.9142,0,4.07,8147.84 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,442.5097,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,249.4045,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,691.9142,0,4.07,8147.84 
+80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,442.5097,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,249.4045,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,691.9142,0,4.07,8147.84 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,442.5097,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,249.4045,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,691.9142,0,4.07,8147.84 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,442.5097,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,249.4045,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,691.9142,0,4.07,8147.84 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,708.8776,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,395.3054,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1104.183,0,5.11,5106.09 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1025.0488,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_silu_B16_B16_B16,0.0%,559.4474,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1584.4962,0,7.12,3558.85 @@ -493,6 +773,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,625.1681,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,436.7729,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,2.8%,1061.941,0,84.93,2664.54 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,369.4651,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_silu_F8_F8_B16,0.0%,236.5581,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,606.0232,0,9.3,4652.07 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,516.6341,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,344.0188,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,860.6529,0,13.1,3276.52 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,440.6719,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,255.5291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,696.201,0,4.05,8097.67 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,440.6719,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,255.5291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,696.201,0,4.05,8097.67 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,440.6719,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,255.5291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,696.201,0,4.05,8097.67 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,440.6719,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,255.5291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,696.201,0,4.05,8097.67 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,440.6719,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,255.5291,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,696.201,0,4.05,8097.67 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,714.05,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,401.6611,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.4%,1115.7111,0,5.05,5053.34 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,985.8866,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,559.8234,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.5%,1545.71,0,7.29,3648.15 @@ -500,6 +784,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1149.8246,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,670.1762,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,1820.0008,0,24.78,3101.36 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1256.2287,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,729.0988,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,1985.3275,0,45.43,2846.8 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,1545.4364,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,1026.4073,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,3.3%,2571.8437000000004,0,70.14,2203.29 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,438.1766,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,247.4425,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,685.6191,0,4.11,8222.65 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,438.1766,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,247.4425,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,685.6191,0,4.11,8222.65 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,438.1766,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,247.4425,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,685.6191,0,4.11,8222.65 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,438.1766,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,247.4425,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,685.6191,0,4.11,8222.65 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,438.1766,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,247.4425,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,685.6191,0,4.11,8222.65 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,712.3917,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,389.8042,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1102.1959,0,5.11,5115.3 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,979.7129,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,568.5654,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1548.2783,0,7.28,3642.1 @@ -507,6 +795,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1171.5851,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,662.7687,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1834.3538,0,24.58,3077.1 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,128,0,27.2341,moe_ck2stages_gemm1_256x128x128x128_1x4_TypeCastExpertWeight_v3_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,1749.6879,moe_ck2stages_gemm2_256x128x128x64_1x4_TypeCast_v3_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,1776.9219999999998,0,50.76,3180.68 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,1551.7224,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,998.809,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,2550.5314,0,70.73,2221.7 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,230.2686,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,158.8229,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,389.0915,0,7.24,7244.87 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,230.2686,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,158.8229,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,389.0915,0,7.24,7244.87 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,230.2686,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,158.8229,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,389.0915,0,7.24,7244.87 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,230.2686,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,158.8229,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,389.0915,0,7.24,7244.87 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,230.2686,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,158.8229,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,389.0915,0,7.24,7244.87 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,372.7495,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.1%,246.5629,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.5%,619.3124,0,9.1,4552.24 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,502.1489,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,339.9999,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.4%,842.1488,0,13.39,3348.52 @@ -514,6 +806,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,591.8466,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.1%,398.0473,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,989.8939,0,45.56,2852.91 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,626.1058,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x256_2tg_pf2E,0.0%,442.0281,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1068.1339,0,84.44,2649.09 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,0,32,0,761.7135,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,621.4747,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight1_F8_F8_B16,3.3%,1383.1882,0,130.42,2053.66 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,227.7593,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,155.545,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,383.3043,0,7.35,7354.25 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,227.7593,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,155.545,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,383.3043,0,7.35,7354.25 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,227.7593,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,155.545,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,383.3043,0,7.35,7354.25 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,227.7593,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,155.545,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,383.3043,0,7.35,7354.25 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,227.7593,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant1_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,155.545,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,383.3043,0,7.35,7354.25 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,370.0464,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,232.0003,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,602.0467,0,9.36,4682.79 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,509.0501,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,326.8678,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,835.9178999999999,0,13.49,3373.48 @@ -521,6 +817,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,610.1172,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,408.2756,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1018.3928,0,44.28,2773.07 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,624.0096,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf2E,0.0%,411.5878,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1035.5974,0,87.09,2732.32 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Tensor,1,1,32,0,759.5362,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,596.1779,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant1_MulRoutedWeight0_F8_F8_B16,0.0%,1355.7141,0,133.06,2095.27 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,231.6437,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,173.6438,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,405.2875,0,6.95,6955.35 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,231.6437,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,173.6438,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,405.2875,0,6.95,6955.35 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,231.6437,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,173.6438,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,405.2875,0,6.95,6955.35 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,231.6437,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,173.6438,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,405.2875,0,6.95,6955.35 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,231.6437,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,173.6438,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,405.2875,0,6.95,6955.35 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,363.3125,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,265.3606,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,628.6731,0,8.97,4484.46 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,507.5861,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,362.0567,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.4%,869.6428,0,12.96,3242.65 @@ -528,6 +828,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,600.7597,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,432.403,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1033.1627,0,43.65,2733.43 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,620.397,_ZN5aiter48fmoe_stage1_bf16_pertokenFp8_g1u1_32x128_3tg_pf3E,0.0%,467.8512,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1088.2482,0,82.88,2600.13 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,750.417,_ZN5aiter44fmoe_stage1_bf16_pertokenFp8_g1u1_32x512_pf3E,0.0%,717.4604,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight1_F8_F8_B16,3.3%,1467.8774,0,122.89,1935.17 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,230.2993,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.4253,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,399.7246,0,7.05,7052.15 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,230.2993,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.4253,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,399.7246,0,7.05,7052.15 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,230.2993,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.4253,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,399.7246,0,7.05,7052.15 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,230.2993,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.4253,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,399.7246,0,7.05,7052.15 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,230.2993,moe_ck2stages_gemm1_256x32x64x256_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,169.4253,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,399.7246,0,7.05,7052.15 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,363.9112,moe_ck2stages_gemm1_256x32x64x128_1x4_MulABScale_v1_Nswizzle0_Quant2_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,259.8372,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,623.7484,0,9.04,4519.87 80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,510.4385,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,361.3331,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,871.7716,0,12.93,3234.73 @@ -535,6 +839,10 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,591.7744,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x128_3tg_pf3E,0.0%,430.4532,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1022.2276,0,44.12,2762.67 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,619.0301,_ZN5aiter57fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x256_2tg_pf2E,0.0%,468.3093,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1087.3393999999998,0,82.95,2602.3 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,1,32,0,749.1329,_ZN5aiter53fmoe_stage1_bf16_pertokenFp8_doweight_g1u1_32x512_pf3E,0.0%,695.2144,moe_ck2stages_gemm2_256x32x64x128_1x4_MulABScaleExpertWeight_v1_Nswizzle0_Quant2_MulRoutedWeight0_F8_F8_B16,0.0%,1444.3473,0,124.89,1966.7 +80,1,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.9501,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,66.7261,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,155.6762,0,3.88,3881.4 +80,2,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.9501,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,66.7261,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,155.6762,0,3.88,3881.4 
+80,4,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.9501,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,66.7261,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,155.6762,0,3.88,3881.4 +80,8,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.9501,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,66.7261,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,155.6762,0,3.88,3881.4 80,16,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,88.9501,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,66.7261,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,155.6762,0,3.88,3881.4 80,32,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,115.8935,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,90.1026,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,205.9961,0,5.86,2934.54 80,64,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,124.4518,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,102.1206,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,226.5724,0,10.66,2670.35 @@ -542,6 +850,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,139.1257,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,108.5817,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,247.7074,0,39.01,2455.21 80,512,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,177.2587,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,152.9295,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,330.1882,0,58.53,1854.6 80,1024,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,286.2896,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,254.7276,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,541.0172,0,71.45,1147.39 +80,1,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.3641,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,61.6683,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,150.0324,0,4.03,4027.41 +80,2,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.3641,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,61.6683,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,150.0324,0,4.03,4027.41 
+80,4,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.3641,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,61.6683,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,150.0324,0,4.03,4027.41 +80,8,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.3641,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,61.6683,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,150.0324,0,4.03,4027.41 80,16,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,88.3641,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,61.6683,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,150.0324,0,4.03,4027.41 80,32,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,116.2949,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,83.4229,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,199.7178,0,6.05,3026.79 80,64,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,128.528,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,93.9786,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,222.5066,0,10.86,2719.15 @@ -549,6 +861,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,140.9059,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,100.6281,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,241.534,0,40.01,2517.96 80,512,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,176.6934,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,140.5842,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,317.2776,0,60.92,1930.07 80,1024,4096,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,285.499,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,233.8056,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,519.3046,0,74.44,1195.36 +80,1,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,37.5796,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,36.5426,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.12219999999999,0,4.07,4075.98 +80,2,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,37.5796,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,36.5426,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.12219999999999,0,4.07,4075.98 
+80,4,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,37.5796,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,36.5426,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.12219999999999,0,4.07,4075.98 +80,8,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,37.5796,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,36.5426,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.12219999999999,0,4.07,4075.98 80,16,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,37.5796,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,36.5426,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,74.12219999999999,0,4.07,4075.98 80,32,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,58.5697,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,49.5953,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,108.165,0,5.58,2794.36 80,64,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,62.6056,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,55.5345,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,118.1401,0,10.22,2560.64 @@ -556,6 +872,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,65.3992,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,59.9428,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,125.34199999999998,0,38.55,2426.06 80,512,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,93.5238,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,83.188,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,176.71179999999998,0,54.69,1732.68 80,1024,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,140.8771,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_gelu_B16_B16_B16,0.0%,134.5253,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.0%,275.4024,0,70.18,1127.0 +80,1,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.1892,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,34.1814,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.3706,0,4.17,4174.64 +80,2,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.1892,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,34.1814,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.3706,0,4.17,4174.64 
+80,4,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.1892,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,34.1814,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.3706,0,4.17,4174.64 +80,8,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.1892,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,34.1814,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.3706,0,4.17,4174.64 80,16,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,38.1892,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,34.1814,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,72.3706,0,4.17,4174.64 80,32,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,59.0984,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,45.8963,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,104.9947,0,5.75,2878.74 80,64,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,62.4816,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,51.5263,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,114.0079,0,10.6,2653.45 @@ -563,6 +883,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
80,256,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,65.5874,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,55.6592,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,121.2466,0,39.85,2508.0 80,512,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,94.7864,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,76.6968,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,171.4832,0,56.35,1785.51 80,1024,2048,192,128,8,ActivationType.Gelu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,1,32,0,144.9248,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCastExpertWeight_v1_Nswizzle0_Quant0_MulRoutedWeight1_gelu_B16_B16_B16,0.0%,123.3403,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_B16_B16_B16,0.0%,268.2651,0,72.05,1156.98 +256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.439,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,41.4885,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.9275,0,12.48,12482.61 +256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.439,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,41.4885,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.9275,0,12.48,12482.61 
+256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.439,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,41.4885,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.9275,0,12.48,12482.61 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.439,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,41.4885,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.9275,0,12.48,12482.61 256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.439,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,41.4885,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.9275,0,12.48,12482.61 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,120.1868,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,59.0038,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,179.1906,0,15.73,7868.57 
256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,141.1318,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,79.7079,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,220.8397,0,25.53,6387.72 @@ -570,6 +894,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,161.4495,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,118.1586,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,279.6081,0,80.64,5059.91 256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,203.4884,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,212.7151,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,416.2035,0,108.35,3412.5 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,328.7665,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,403.3563,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,732.1228,0,123.2,1955.01 
+256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.08,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,71.7868,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,208.8668,0,13.49,13496.24 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.08,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,71.7868,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,208.8668,0,13.49,13496.24 +256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.08,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,71.7868,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,208.8668,0,13.49,13496.24 +256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.08,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,71.7868,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,208.8668,0,13.49,13496.24 
256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.08,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,71.7868,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,208.8668,0,13.49,13496.24 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,205.7119,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,103.7369,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,16.0%,309.4488,0,18.22,9110.59 256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,282.9681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,143.4641,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.9%,426.4322,0,26.44,6612.89 @@ -577,6 +905,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,318.0049,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,179.0098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,497.0147,0,90.74,5682.08 
256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,403.3586,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,227.8796,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,631.2382,0,142.88,4482.59 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,553.5171,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,421.0368,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,974.5539,0,185.1,2914.76 +256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.51,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,71.4507,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,208.9607,0,13.49,13490.17 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.51,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,71.4507,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,208.9607,0,13.49,13490.17 
+256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.51,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,71.4507,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,208.9607,0,13.49,13490.17 +256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.51,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,71.4507,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,208.9607,0,13.49,13490.17 256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,137.51,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,71.4507,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,208.9607,0,13.49,13490.17 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,206.2526,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,103.5784,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.9%,309.831,0,18.19,9099.35 
256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,282.8631,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,144.8538,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,427.7169,0,26.36,6593.03 @@ -584,6 +916,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,315.2045,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,179.0137,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,494.2182,0,91.25,5714.23 256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,408.1562,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,228.0692,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,636.2254,0,141.76,4447.45 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,572.8802,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,420.9959,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,993.8761,0,181.5,2858.1 
+256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,54.4576,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,46.027,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,100.4846,0,12.02,6012.63 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,54.4576,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,46.027,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,100.4846,0,12.02,6012.63 +256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,54.4576,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,46.027,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,100.4846,0,12.02,6012.63 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,54.4576,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,46.027,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,100.4846,0,12.02,6012.63 
256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,54.4576,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,46.027,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,100.4846,0,12.02,6012.63 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,78.6177,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,60.2532,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,138.8709,0,17.4,4352.05 256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,81.5016,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,66.9561,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,148.4577,0,32.55,4073.66 @@ -591,6 +927,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,86.024,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,141.2712,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,227.2952,0,85.03,2671.09 
256,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,91.3559,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,248.1618,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,339.5177,0,113.85,1797.47 256,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,124.3946,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,473.4248,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,597.8194,0,129.32,1031.35 +256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,55.9955,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,45.9382,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,101.9337,0,11.85,5927.15 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,55.9955,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,45.9382,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,101.9337,0,11.85,5927.15 
+256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,55.9955,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,45.9382,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,101.9337,0,11.85,5927.15 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,55.9955,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,45.9382,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,101.9337,0,11.85,5927.15 256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,55.9955,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,45.9382,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,101.9337,0,11.85,5927.15 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,81.7704,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,60.4356,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,142.206,0,16.99,4249.98 
256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,86.8808,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,66.8218,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,153.7026,0,31.44,3934.65 @@ -598,6 +938,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,90.4642,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,141.6691,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,232.1333,0,83.26,2615.42 256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,95.7813,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,247.6126,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,343.3939,0,112.57,1777.18 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,64,0,128.833,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.1%,473.6369,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,602.4699,0,128.32,1023.39 
+256,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,71.6188,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,41.3596,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,112.9784,0,12.47,12476.99 +256,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,71.6188,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,41.3596,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,112.9784,0,12.47,12476.99 +256,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,71.6188,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,41.3596,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,112.9784,0,12.47,12476.99 +256,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,71.6188,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,41.3596,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,112.9784,0,12.47,12476.99 
256,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,71.6188,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,41.3596,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,112.9784,0,12.47,12476.99 256,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,120.1253,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,58.8622,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,178.9875,0,15.75,7877.5 256,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,141.4796,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,79.0378,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,220.5174,0,25.56,6397.06 @@ -605,6 +949,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,161.8062,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,117.8189,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,279.6251,0,80.64,5059.6 
256,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,203.3392,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,212.9202,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,416.2594,0,108.34,3412.05 256,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,329.0517,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,400.2276,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,729.2793,0,123.68,1962.63 +256,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.1212,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,71.9542,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,209.0754,0,13.48,13482.77 +256,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.1212,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,71.9542,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,209.0754,0,13.48,13482.77 
+256,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.1212,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,71.9542,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,209.0754,0,13.48,13482.77 +256,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.1212,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,71.9542,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,209.0754,0,13.48,13482.77 256,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.1212,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,71.9542,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,209.0754,0,13.48,13482.77 256,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,205.6879,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,103.7835,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,309.4714,0,18.22,9109.92 
256,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,280.6797,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,145.7947,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,426.4744,0,26.44,6612.23 @@ -612,6 +960,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,316.5168,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,179.9899,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,496.5067,0,90.83,5687.89 256,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,403.1043,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,229.7632,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,632.8675,0,142.52,4471.05 256,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,553.6587,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,425.3929,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,979.0516,0,184.25,2901.37 
+256,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.2707,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,71.8847,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,209.1554,0,13.48,13477.62 +256,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.2707,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,71.8847,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,209.1554,0,13.48,13477.62 +256,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.2707,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,71.8847,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,209.1554,0,13.48,13477.62 +256,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.2707,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,71.8847,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,209.1554,0,13.48,13477.62 
256,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,137.2707,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,71.8847,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,209.1554,0,13.48,13477.62 256,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,205.8602,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,104.2026,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,310.0628,0,18.18,9092.55 256,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,281.4413,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,145.7705,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,427.2118,0,26.39,6600.82 @@ -619,6 +971,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,314.8954,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,179.522,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,494.4174,0,91.21,5711.93 
256,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,406.507,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,230.6196,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,637.1266,0,141.56,4441.16 256,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,575.7451,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,426.4277,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1002.1728,0,180.0,2834.43 +256,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89 +256,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89 
+256,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89 +256,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89 256,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,56.2731,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,45.5448,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,101.8179,0,11.86,5933.89 256,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,80.9637,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,59.914,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,140.8777,0,17.15,4290.05 
256,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,84.98,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,66.4795,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,151.4595,0,31.9,3992.92 @@ -632,6 +988,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,160.0658,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,117.833,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,277.8988,0,81.14,5091.03 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,205.7498,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,212.5856,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,418.3354,0,107.8,3395.11 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,16,0,327.5497,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.1%,402.6535,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,730.2032,0,123.52,1960.15 
+256,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,57.2319,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,44.9913,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,102.2232,0,11.82,5910.36 +256,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,57.2319,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,44.9913,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,102.2232,0,11.82,5910.36 +256,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,57.2319,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,44.9913,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,102.2232,0,11.82,5910.36 +256,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,57.2319,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,44.9913,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,102.2232,0,11.82,5910.36 
256,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,57.2319,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,44.9913,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,102.2232,0,11.82,5910.36 256,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,83.8327,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,58.83,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,142.6627,0,16.93,4236.38 256,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,88.5384,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,66.7218,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,155.2602,0,31.12,3895.18 @@ -639,6 +999,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,92.6716,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,141.8887,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,234.5603,0,82.4,2588.36 
256,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,95.6695,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,247.8364,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,343.5059,0,112.53,1776.6 256,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,1,64,0,133.8828,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.1%,474.349,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,608.2318,0,127.11,1013.7 +80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,123.5681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,83.7681,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,207.3362,0,6.8,6798.77 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,123.5681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,83.7681,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,207.3362,0,6.8,6798.77 
+80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,123.5681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,83.7681,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,207.3362,0,6.8,6798.77 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,123.5681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,83.7681,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,207.3362,0,6.8,6798.77 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,123.5681,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,83.7681,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,207.3362,0,6.8,6798.77 80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,204.1789,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,127.8428,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,332.0217,0,8.49,4246.63 
80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,256.3235,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,173.5099,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,429.8334,0,13.11,3281.88 @@ -646,10 +1010,22 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,307.9267,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,225.0382,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,532.9649,0,42.31,2654.57 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,388.2337,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,321.3807,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,709.6144,0,63.55,2001.5 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,557.7338,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,556.9381,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1114.6719,0,80.92,1284.06 
+80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,245.7053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,130.3948,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.1001,0,7.49,7495.12 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,245.7053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,130.3948,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.1001,0,7.49,7495.12 +80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,245.7053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,130.3948,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.1001,0,7.49,7495.12 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,245.7053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,130.3948,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.1001,0,7.49,7495.12 
80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,245.7053,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,130.3948,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.1001,0,7.49,7495.12 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,374.0904,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,216.3424,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,590.4328,0,9.55,4774.9 80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,500.4382,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,283.1229,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.9%,783.5611,0,14.39,3598.89 +80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,218.6275,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,307.0939,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,525.7214,0,2.3,1149.23 
+80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,218.6275,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,307.0939,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,525.7214,0,2.3,1149.23 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,218.6275,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,307.0939,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,525.7214,0,2.3,1149.23 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,218.6275,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,307.0939,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,525.7214,0,2.3,1149.23 80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,218.6275,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,307.0939,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,525.7214,0,2.3,1149.23 
+80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,124.4194,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.1785,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,207.5979,0,6.79,6790.19 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,124.4194,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.1785,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,207.5979,0,6.79,6790.19 +80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,124.4194,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.1785,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,207.5979,0,6.79,6790.19 +80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,124.4194,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.1785,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,207.5979,0,6.79,6790.19 
80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,124.4194,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,83.1785,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,207.5979,0,6.79,6790.19 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,204.8947,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,128.9358,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,333.8305,0,8.44,4223.62 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,252.4238,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,175.0782,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,427.502,0,13.19,3299.78 @@ -667,6 +1043,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,296.2706,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,517.6107,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,813.8813,0,23.75,745.96 
80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,324.2019,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,548.1039,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.5%,872.3058,0,44.31,699.61 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,64,0,509.2109,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,865.0837,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,1374.2946,0,56.25,448.64 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,244.2105,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,132.0083,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.2188,0,7.49,7492.76 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,244.2105,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,132.0083,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.2188,0,7.49,7492.76 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,244.2105,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,132.0083,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.2188,0,7.49,7492.76 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,244.2105,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,132.0083,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.2188,0,7.49,7492.76 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,244.2105,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,132.0083,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,376.2188,0,7.49,7492.76 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,366.6026,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,203.0742,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.8%,569.6768,0,9.9,4948.88 
80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,517.7397,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,297.5602,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.9%,815.2999,0,13.83,3458.79 @@ -674,6 +1054,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,596.114,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,376.262,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,972.376,0,46.38,2904.31 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,703.0772,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,515.3501,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,1218.4273,0,74.03,2322.32 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,1032.1656,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_gelu_F8_F8_B16,0.0%,839.6044,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,1871.77,0,96.37,1517.6 
+80,1,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,218.3233,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,305.0925,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,523.4158,0,2.31,1154.3 +80,2,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,218.3233,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,305.0925,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,523.4158,0,2.31,1154.3 +80,4,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,218.3233,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,305.0925,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,523.4158,0,2.31,1154.3 +80,8,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,218.3233,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,305.0925,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,523.4158,0,2.31,1154.3 
80,16,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,218.3233,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,305.0925,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,523.4158,0,2.31,1154.3 80,32,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,272.2826,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,435.0107,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.9%,707.2933,0,3.42,854.49 80,64,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,275.5136,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,479.9084,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,755.422,0,6.4,800.57 @@ -681,6 +1065,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,293.1722,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,514.6465,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,807.8187,0,23.93,751.56 
80,512,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,316.2687,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,545.4219,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,861.6906,0,44.86,708.23 80,1024,4096,384,128,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,495.1237,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,858.9692,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1354.0929,0,57.09,455.33 +80,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.9899,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,81.8591,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,206.849,0,6.81,6814.78 +80,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.9899,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,81.8591,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,206.849,0,6.81,6814.78 
+80,4,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.9899,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,81.8591,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,206.849,0,6.81,6814.78 +80,8,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.9899,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,81.8591,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,206.849,0,6.81,6814.78 80,16,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.9899,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,81.8591,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,206.849,0,6.81,6814.78 80,32,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,205.3401,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,128.7497,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,334.0898,0,8.44,4220.35 
80,64,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,255.6311,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,170.7752,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,426.4063,0,13.22,3308.26 @@ -688,6 +1076,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,301.0373,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,225.1659,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,526.2032,0,42.85,2688.68 80,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,398.4468,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,319.45,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,717.8968,0,62.82,1978.41 80,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,558.3594,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,555.8992,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1114.2586,0,80.95,1284.54 
+80,1,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.1564,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.5223,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,207.6787,0,6.79,6787.55 +80,2,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.1564,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.5223,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,207.6787,0,6.79,6787.55 +80,4,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.1564,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.5223,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,207.6787,0,6.79,6787.55 +80,8,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.1564,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.5223,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,207.6787,0,6.79,6787.55 
80,16,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,124.1564,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,83.5223,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,207.6787,0,6.79,6787.55 80,32,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,198.0641,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,130.4169,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,328.481,0,8.58,4292.41 80,64,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,254.9635,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,170.7009,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,425.6644,0,13.24,3314.02 @@ -695,6 +1087,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,310.1395,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,218.1749,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,528.3144,0,42.68,2677.93 
80,512,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,384.4754,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,319.2123,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,703.6877,0,64.09,2018.36 80,1024,7168,256,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,560.9966,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,549.7671,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,1110.7637,0,81.2,1288.58 +80,1,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,241.8455,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,132.6515,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,374.497,0,7.53,7527.21 +80,2,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,241.8455,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,132.6515,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,374.497,0,7.53,7527.21 
+80,4,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,241.8455,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,132.6515,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,374.497,0,7.53,7527.21 +80,8,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,241.8455,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,132.6515,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,374.497,0,7.53,7527.21 80,16,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,241.8455,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,132.6515,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,374.497,0,7.53,7527.21 80,32,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,375.1735,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,214.1587,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.6%,589.3322,0,9.57,4783.82 
80,64,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,497.6768,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,302.6195,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,800.2963,0,14.09,3523.63 @@ -702,6 +1098,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,578.2908,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,375.1603,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,953.4511,0,47.3,2961.95 80,512,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,727.1819,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,509.1783,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1236.3602,0,72.95,2288.64 80,1024,7168,512,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1059.0782,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_silu_F8_F8_B16,0.0%,841.9574,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1901.0356,0,94.89,1494.23 
+80,1,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,221.5506,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,326.2097,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,547.7603,0,2.21,1102.99 +80,2,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,221.5506,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,326.2097,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,547.7603,0,2.21,1102.99 +80,4,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,221.5506,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,326.2097,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,547.7603,0,2.21,1102.99 +80,8,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,221.5506,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,326.2097,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,547.7603,0,2.21,1102.99 
80,16,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,221.5506,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,326.2097,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,547.7603,0,2.21,1102.99 80,32,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,276.3545,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,408.058,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,684.4125,0,3.53,883.05 80,64,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,278.4781,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,520.2659,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,798.744,0,6.05,757.15 @@ -709,6 +1109,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,296.1626,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,515.0018,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,811.1644,0,23.83,748.46 
80,512,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,319.2415,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,544.9096,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,864.1511,0,44.73,706.21 80,1024,4096,384,128,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,64,0,500.5853,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,854.6331,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightA8W8blkscale_v3_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1355.2184,0,57.05,454.95 +80,1,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,243.5773,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,134.825,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,378.4023,0,7.45,7449.52 +80,2,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,243.5773,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,134.825,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,378.4023,0,7.45,7449.52 
+80,4,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,243.5773,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,134.825,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,378.4023,0,7.45,7449.52 +80,8,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,243.5773,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,134.825,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,378.4023,0,7.45,7449.52 80,16,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,243.5773,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,134.825,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,378.4023,0,7.45,7449.52 80,32,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,377.9925,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,208.2557,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,586.2482,0,9.62,4808.99 
80,64,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,508.1175,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,301.8424,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.8%,809.9599,0,13.92,3481.59 @@ -717,6 +1121,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,512,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,734.3088,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,509.0543,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1243.3631,0,72.54,2275.75 80,1024,7168,512,256,8,ActivationType.Gelu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,1,16,0,1040.5956,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_gelu_F8_F8_B16,0.0%,844.24,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_F8_F8_B16,16.7%,1884.8356,0,95.71,1507.08 80,56,6144,4096,8,2,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,228.7482,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.5%,0.0,Null,0.0%,228.7482,1,73.93,2644.88 +80,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 
+80,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 +80,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 +80,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 80,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,245.0416,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,245.0416,1,6.47,5775.08 80,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,370.7841,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,370.7841,1,8.55,3817.53 80,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,428.9409,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,428.9409,1,14.78,3301.54 @@ -724,6 +1132,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,536.7655,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,536.7655,1,47.26,2646.03 
80,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,560.4425,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,560.4425,1,90.53,2544.06 80,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,827.4898,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,827.4898,1,122.62,1736.35 +80,1,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,274.0603,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3E,4.9%,150.3324,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.3%,424.3927,0,33.21,3425.3 +80,2,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,274.0603,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3E,4.9%,150.3324,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.3%,424.3927,0,33.21,3425.3 +80,4,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,274.0603,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3E,4.9%,150.3324,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.3%,424.3927,0,33.21,3425.3 +80,8,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,274.0603,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3E,4.9%,150.3324,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.3%,424.3927,0,33.21,3425.3 
80,16,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,274.0603,_ZN5aiter59fmoe_stage1_bf16_pertokenFp8_blockscale_g1u1_16x256_2tg_pf3E,4.9%,150.3324,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.3%,424.3927,0,33.21,3425.3 80,32,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,16,0,359.0112,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.0%,190.8827,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,0.2%,549.8939,0,51.26,2644.17 80,64,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,631.2833,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.9%,0.0,Null,0.0%,631.2833,1,89.3,2304.36 @@ -731,6 +1143,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 80,256,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,1166.708,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.9%,0.0,Null,0.0%,1166.708,1,193.27,1250.38 80,512,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,2209.3824,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.8%,0.0,Null,0.0%,2209.3824,1,204.12,662.78 80,1024,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_1x128,1,0,32,0,4205.8762,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.9%,0.0,Null,0.0%,4205.8762,1,214.45,350.78 
+80,1,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,442.3731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.8%,0.0,Null,0.0%,442.3731,1,31.86,3286.07 +80,2,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,442.3731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.8%,0.0,Null,0.0%,442.3731,1,31.86,3286.07 +80,4,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,442.3731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.8%,0.0,Null,0.0%,442.3731,1,31.86,3286.07 +80,8,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,442.3731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.8%,0.0,Null,0.0%,442.3731,1,31.86,3286.07 80,16,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,442.3731,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.8%,0.0,Null,0.0%,442.3731,1,31.86,3286.07 80,32,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,520.7061,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x512E,5.4%,0.0,Null,0.0%,520.7061,1,54.13,2792.39 80,64,7168,2048,33,10,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fnuz,torch.float8_e4m3fnuz,QuantType.per_Token,1,0,32,0,622.6569,_ZN5aiter45fmoe_bf16_pertokenFp8_g1u1_vs_silu_1tg_32x256E,6.9%,0.0,Null,0.0%,622.6569,1,90.53,2336.28 @@ -744,6 +1160,10 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 
256,128,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,72.9667,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,41.6406,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,114.6073,0,42.16,5288.29 256,64,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,71.4856,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,39.357,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,110.8426,0,21.8,5458.45 256,32,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,62.7201,moe_ck2stages_gemm1_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,35.3504,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,98.0705,0,12.32,6163.97 +256,1,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,47.8381,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,28.9978,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,76.8359,0,7.86,7864.06 +256,2,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,47.8381,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,28.9978,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,76.8359,0,7.86,7864.06 
+256,4,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,47.8381,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,28.9978,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,76.8359,0,7.86,7864.06 +256,8,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,47.8381,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,28.9978,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,76.8359,0,7.86,7864.06 256,16,4096,192,128,8,ActivationType.Silu,torch.bfloat16,torch.bfloat16,torch.bfloat16,QuantType.No,1,0,32,0,47.8381,moe_ck2stages_gemm1_256x32x64x128_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight0_silu_B16_B16_B16,0.0%,28.9978,moe_ck2stages_gemm2_256x32x64x64_1x4_TypeCast_v1_Nswizzle0_Quant0_MulRoutedWeight1_B16_B16_B16,0.1%,76.8359,0,7.86,7864.06 256,1,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,45.285,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,9.0945,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,54.3795,0,1.62,25916.16 256,2,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,46.5232,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,11.8082,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,58.3314,0,3.02,24160.73 @@ -756,15 +1176,19 @@ 
cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,256,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,249.5786,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,249.5786,1,90.35,5668.72 256,512,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,260.9691,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,260.9691,1,172.81,5442.39 256,1024,7168,256,256,8,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61 +256,1,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 +256,2,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 +256,4,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 +256,8,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 256,16,5120,1024,128,1,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_Token,1,1,32,0,0.0,_ZN5aiter46fmoe_bf16_pertokenFp8_g1u1_tkw1_silu_1tg_32x64E,0.0%,0.0,Null,0,0.0,1,0.0,0.0 
-256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,45.285,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,9.0945,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,54.3795,0,1.62,25916.16 -256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,46.5232,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,11.8082,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.3%,58.3314,0,3.02,24160.73 -256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,48.2418,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,17.8498,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.4%,66.0916,0,5.33,21324.53 -256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,53.6435,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,25.7951,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.7%,79.4386,0,8.87,17742.74 
+256,1,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3 +256,2,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3 +256,4,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3 +256,8,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3 
256,16,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,16,0,71.1678,moe_ck2stages_gemm1_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight0_silu_F8_F8_B16,0.1%,41.5098,moe_ck2stages_gemm2_256x16x128x256_1x4_MulABScaleExpertWeightA8W8blkscale_v1_Nswizzle0_Quant4_MulRoutedWeight1_F8_F8_B16,15.6%,112.6776,0,12.51,12510.3 256,32,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,158.4834,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,158.4834,1,17.78,8896.67 256,64,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,212.9873,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,212.9873,1,26.47,6623.22 256,128,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,241.6039,_ZN5aiter50fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_ps_32x256E,0.0%,0.0,Null,0.0%,241.6039,1,46.66,5844.44 256,256,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,249.5786,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,249.5786,1,90.35,5668.72 256,512,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,260.9691,_ZN5aiter47fmoe_bf16_blockscaleFp8_g1u1_vs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,260.9691,1,172.81,5442.39 -256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61 \ No newline at end of file 
+256,1024,7168,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float8_e4m3fn,QuantType.per_1x128,1,0,32,0,359.4797,_ZN5aiter49fmoe_bf16_blockscaleFp8_g1u1_novs_silu_1tg_32x256E,0.0%,0.0,Null,0.0%,359.4797,1,250.9,3981.61 diff --git a/aiter/fused_moe.py b/aiter/fused_moe.py index abfefda272..3cbbbbf3c8 100644 --- a/aiter/fused_moe.py +++ b/aiter/fused_moe.py @@ -104,6 +104,7 @@ def fused_moe( intermediate_pad=0, bias1=None, bias2=None, + splitk=0, ): if not block_size_M: block_size_M = -1 @@ -217,7 +218,15 @@ def fused_moe_( quant_type = quant_remap.get(quant_type, quant_type) q_dtype_w = w1.dtype q_dtype_a = w1.dtype if w1.dtype != torch.uint32 else dtypes.fp8 - q_dtype_a = dtypes.fp4x2 if quant_type == QuantType.per_1x32 else q_dtype_a + bf16_fp8_bound = 512 + if quant_type == QuantType.per_1x32: + if activation == ActivationType.Swiglu: + if get_gfx() != "gfx950" or M < bf16_fp8_bound: + q_dtype_a = dtypes.bf16 + elif M >= bf16_fp8_bound: + q_dtype_a = dtypes.fp8 + else: + q_dtype_a = dtypes.fp4x2 metadata = get_2stage_cfgs( get_padded_M(M), # consider token_num > 1024 as prefill @@ -234,8 +243,6 @@ def fused_moe_( doweight_stage1, hidden_pad, intermediate_pad, - bias1, - bias2, ) block_size_M = metadata.block_m if block_size_M is None else block_size_M @@ -472,6 +479,33 @@ def get_block_size_M(token, topk, expert, inter_dim): return sorted(tmp, key=lambda x: x[:2])[0][-1] +@functools.lru_cache(maxsize=2048) +def get_ksplit(token, topk, expert, inter_dim, model_dim): + aiter_ksplit = int(os.environ.get("AITER_KSPLIT", "0")) + if aiter_ksplit != 0: + return aiter_ksplit + # only for moe_blk gemm1 a8w8 decode scenario + if token * topk > expert: + return 0 + cu_num = get_cu_num() + tileN = 128 + + tgM = token * topk # decode tile num + tgN = (inter_dim * 2 + tileN - 1) // tileN + + tg_num = tgN * tgM + # if all cu already active + if tg_num >= cu_num: + return 0 + tilek = 256 + split_max = (cu_num + tg_num - 1) // tg_num + # at least split = 
2 + for i in reversed(range(2, split_max + 1)): + if (model_dim % i == 0) and ((model_dim // i) % tilek == 0): + return i + return 0 + + cfg_2stages = None # fmt: off fused_moe_1stage_dict = { @@ -512,7 +546,8 @@ def nextPow2(n): def get_padded_M(M): padded_m = M if M >= 1 and M <= 16: - padded_m = 16 + # decoding policy may be changed in the future. + padded_m = nextPow2(padded_m) elif M < 1024: padded_m = nextPow2(padded_m) elif M < 2048: @@ -531,6 +566,7 @@ class MOEMetadata: block_m: int ksplit: int run_1stage: bool = False + has_bias: bool = False @functools.lru_cache(maxsize=2048) @@ -549,8 +585,6 @@ def get_2stage_cfgs( doweight_stage1, hidden_pad, intermediate_pad, - bias1, - bias2, ): def get_cfg_2stages(tune_file): import pandas as pd @@ -620,8 +654,22 @@ def FinalFunc(): ) logger.info("\033[0m") + def use_cfg(): + problem_type = (activation, dtype, q_dtype_a, q_dtype_w, q_type) + bypass_type = ( + ActivationType.Silu, + dtypes.bf16, + dtypes.fp8, + dtypes.fp8, + QuantType.per_1x128, + ) + if problem_type == bypass_type and (token * topk) <= 128: # bypass tuned + aiter.logger.info("bypass tuned results for fp8 blockscale") + return False + return True + # cfg = cfg_2stages.get(keys, None) - cfg = cfg_2stages.get(keys, None) if cfg_2stages else None + cfg = cfg_2stages.get(keys, None) if cfg_2stages and use_cfg() else None if cfg is None and os.environ.get("AITER_ONLINE_TUNE", "0") == "1": lock_path = os.path.join(bd_dir, f"lock_fmoe_tune_{keys}") mp_lock(lock_path, MainFunc=MainFunc, FinalFunc=FinalFunc) @@ -630,7 +678,7 @@ def FinalFunc(): cfg = cfg_2stages.get(keys, None) if cfg_2stages else None if cfg is None: logger.warning(f"Fmoe tuning not support for {keys}") - if cfg is None: + if cfg is None or int(os.environ.get("AITER_BYPASS_TUNE_CONFIG", "0")): ksplit = 0 kernelName1 = "" kernelName2 = "" @@ -645,7 +693,7 @@ def FinalFunc(): doweight_stage1, ) in fused_moe_1stage_dict[get_gfx()]: if q_type == QuantType.per_1x128: - run_1stage = True and 
(inter_dim % 128 == 0) + run_1stage = token > 32 and (inter_dim % 256 == 0) elif q_type == QuantType.per_Token and q_dtype_w == dtypes.i8: run_1stage = token > 32 elif q_type == QuantType.per_Token and q_dtype_w == dtypes.fp8: @@ -657,11 +705,23 @@ def FinalFunc(): BLOCK_SIZE_M if run_1stage else ( - 64 + (64 if token > 32 else 16) if q_type == QuantType.per_1x128 else get_block_size_M(token, topk, expert, inter_dim) ) ) + ksplit = ( + ksplit + if (run_1stage) + else ( + get_ksplit(token, topk, expert, inter_dim, model_dim) + if q_type == QuantType.per_1x128 + else ksplit + ) + ) + aiter.logger.info( + f"run_1stage = {run_1stage}, ksplit = {ksplit} q_type = {q_type}" + ) else: block_m = cfg["block_m"] ksplit = cfg["ksplit"] @@ -673,6 +733,13 @@ def FinalFunc(): logger.info( f"[fused_moe] using {'1stage' if run_1stage else '2stage'} {'default' if cfg is None else tag} for {keys} " ) + + def get_block_m() -> int: + if q_dtype_a == dtypes.fp8: + return 32 + else: + return 16 if token < 2048 else 32 if token < 16384 else 64 + if run_1stage: return MOEMetadata( functools.partial( @@ -696,17 +763,16 @@ def FinalFunc(): cktile_moe_stage1, n_pad_zeros=intermediate_pad // 64 * 64 * (2 if use_g1u1 else 1), k_pad_zeros=hidden_pad // 128 * 128, - bias1=bias1, ), functools.partial( cktile_moe_stage2, n_pad_zeros=hidden_pad // 64 * 64, k_pad_zeros=intermediate_pad // 128 * 128, - bias2=bias2, ), - 16 if token < 2048 else 32 if token < 16384 else 64, + get_block_m(), ksplit, False, + True, ) if ( "ck2stages" in kernelName1 @@ -717,14 +783,16 @@ def FinalFunc(): dtypes.fp16, torch.uint32, dtypes.fp4x2, + dtypes.fp8, ] ): return MOEMetadata( functools.partial( - aiter.ck_moe_stage1_fwd, + ck_moe_stage1, kernelName=kernelName1, activation=activation, quant_type=q_type, + splitk=ksplit, ), functools.partial( aiter.ck_moe_stage2_fwd, @@ -812,17 +880,27 @@ def fused_moe_2stages( doweight_stage1, hidden_pad, intermediate_pad, - bias1, - bias2, ) if ( quant_type == QuantType.per_1x32 and 
dtype in [dtypes.bf16, dtypes.fp16] + and q_dtype_a in [dtypes.bf16, dtypes.fp16] and w1.dtype == dtypes.fp4x2 and activation == ActivationType.Swiglu ): a1 = hidden_states.to(dtype) a1_scale = None + elif ( + quant_type == aiter.QuantType.per_1x32 + and dtype in [dtypes.bf16, dtypes.fp16] + and q_dtype_a == dtypes.fp8 + and w1.dtype == dtypes.fp4x2 + and activation == aiter.ActivationType.Swiglu + ): + a1 = hidden_states.to(dtypes.fp8) + M = sorted_ids.shape[0] + N = a1.shape[-1] + a1_scale = torch.ones([M, N // 32], dtype=dtypes.fp8_e8m0, device=a1.device) elif quant_type == QuantType.per_1x32: if token_num <= token_num_quant_moe_sort_switch: a1, a1_scale = fused_dynamic_mxfp4_quant_moe_sort( @@ -874,7 +952,17 @@ def fused_moe_2stages( dtype=dtype, device=device, ) - + extra_stage1_args = {} + extra_stage2_args = {} + if ( + not metadata.run_1stage + and metadata.has_bias + and dtype in [dtypes.bf16, dtypes.fp16] + and quant_type == QuantType.per_1x32 + and activation == ActivationType.Swiglu + ): + extra_stage1_args["bias1"] = bias1 + extra_stage2_args["bias2"] = bias2 a2 = metadata.stage1( a1, w1, @@ -886,17 +974,31 @@ def fused_moe_2stages( topk, block_m=block_size_M, a1_scale=a1_scale, - w1_scale=w1_scale, + w1_scale=( + w1_scale.view(dtypes.fp8_e8m0) if w1.dtype == dtypes.fp4x2 else w1_scale + ), sorted_weights=sorted_weights if doweight_stage1 else None, + dtype=dtype, + **extra_stage1_args, ) if ( quant_type == QuantType.per_1x32 and dtype in [dtypes.bf16, dtypes.fp16] + and q_dtype_a in [dtypes.bf16, dtypes.fp16] and w1.dtype == dtypes.fp4x2 and activation == ActivationType.Swiglu ): a2_scale = None + elif ( + quant_type == aiter.QuantType.per_1x32 + and dtype in [dtypes.bf16] + and q_dtype_a == dtypes.fp8 + and w1.dtype == dtypes.fp4x2 + and activation == aiter.ActivationType.Swiglu + ): + a2 = a2.to(dtypes.fp8) + a2_scale = a1_scale elif quant_type == QuantType.per_1x32: a2 = a2.view(-1, inter_dim) if token_num <= token_num_quant_moe_sort_switch: @@ 
-952,10 +1054,13 @@ def fused_moe_2stages( num_valid_ids, moe_out, topk, - w2_scale=w2_scale, + w2_scale=( + w2_scale.view(dtypes.fp8_e8m0) if w2.dtype == dtypes.fp4x2 else w2_scale + ), a2_scale=a2_scale, block_m=block_size_M, sorted_weights=sorted_weights if not doweight_stage1 else None, + **extra_stage2_args, ) return moe_out @@ -1293,6 +1398,60 @@ def torch_moe_stage2( return out.sum(1).to(dtype) +def ck_moe_stage1( + hidden_states, + w1, # [E, inter_dim*2, model_dim] + w2, # [E, model_dim, inter_dim] + sorted_token_ids, # [max_num_tokens_padded] + sorted_expert_ids, # [max_num_m_blocks] + num_valid_ids, # [1] + out, + topk, + block_m, + a1_scale, + w1_scale, + kernelName="", + sorted_weights=None, + quant_type=aiter.QuantType.No, + activation=ActivationType.Gelu, + splitk=1, + dtype=None, +): + token_num = hidden_states.shape[0] + tmp_out = ( + torch.zeros( + (token_num, topk, w1.shape[1]), dtype=dtypes.fp32, device=out.device + ) + if splitk > 1 + else out + ) + aiter.ck_moe_stage1_fwd( + hidden_states, + w1, + w2, + sorted_token_ids, + sorted_expert_ids, + num_valid_ids, + tmp_out, + topk, + kernelName, + w1_scale, + a1_scale, + block_m, + sorted_weights, + quant_type, + activation, + splitk, + out.dtype, + ) + if splitk > 1: + if activation == ActivationType.Silu: + aiter.silu_and_mul(out, tmp_out.view(dtypes.fp32).to(out.dtype)) + else: + aiter.gelu_and_mul(out, tmp_out.view(dtypes.fp32).to(out.dtype)) + return out + + def cktile_moe_stage1( hidden_states, w1, @@ -1309,6 +1468,7 @@ def cktile_moe_stage1( n_pad_zeros=0, k_pad_zeros=0, bias1=None, + dtype=torch.bfloat16, ): token_num = hidden_states.shape[0] _, n1, k1 = w1.shape @@ -1318,9 +1478,8 @@ def cktile_moe_stage1( if w1.dtype is torch.uint32: D = D * 8 - out = torch.empty( - (token_num, topk, D), dtype=hidden_states.dtype, device=hidden_states.device - ) + out = torch.empty((token_num, topk, D), dtype=dtype, device=hidden_states.device) + # print("Run cktile_moe_stage1: M=%d, N(N*2)=%d, K=%d, 
topk=%d, expert=%d"%(token_num, w1.shape[1], hidden_states.shape[1], topk, w1.shape[0])) aiter.moe_cktile2stages_gemm1( hidden_states, diff --git a/aiter/ops/moe_op.py b/aiter/ops/moe_op.py index f3c24e043b..0afd79c38e 100755 --- a/aiter/ops/moe_op.py +++ b/aiter/ops/moe_op.py @@ -223,17 +223,22 @@ def cmdGenFunc_ck_moe_stage( sorted_weights: Optional[Tensor] = None, quant_type: int = 0, activation: int = 0, + splitk: int = 1, + dst_type: Optional[str] = None, ): mul_routed_weight_stage = 2 if sorted_weights is None else 1 + is_splitk = splitk > 1 + outtype = str2dtype_dict[dst_type] if is_splitk else out.dtype md_name, blob_gen_cmd = get_moe_stage_module( hidden_states.dtype, w1.dtype, - out.dtype, + outtype, activation, quant_type, mul_routed_weight_stage, getattr(w1, "is_shuffled", False), + is_splitk, ) return { "md_name": md_name, @@ -292,6 +297,8 @@ def ck_moe_stage1( sorted_weights: Optional[Tensor] = None, quant_type: int = 0, activation: int = 0, + splitk: int = 1, + dst_type: Optional[str] = None, ) -> None: ... 
@@ -431,6 +438,11 @@ def moe_cktile2stages_gemm2( torch.int4: "i4", } +str2dtype_dict = { + "f16": dtypes.fp16, + "b16": dtypes.bf16, +} + @functools.lru_cache(maxsize=1024) def get_moe_stage_module( @@ -441,6 +453,7 @@ def get_moe_stage_module( quant_type, mul_routed_weight_stage, preshuffle_mode=False, + is_splitk=False, ): if isinstance(activation, int): activation = ActivationType(activation) @@ -455,6 +468,7 @@ def get_moe_stage_module( if preshuffle_mode and weight_dtype == dtypes.fp4x2: preshuffle_str = "--preshuffle" + splitk_str = "--issplitk" if is_splitk else "" quant_type = ( QuantType.per_1x128 if quant_type == QuantType.per_128x128 else quant_type ) @@ -471,10 +485,11 @@ def get_moe_stage_module( act, quant_type, f"mulWeightStage{mul_routed_weight_stage}", + "splitk" if is_splitk else "", ] ) blob_gen_cmd = [ - f"{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/gen_instances.py -a {Adtype} -b {Bdtype} -c {Cdtype} -q {quant_type} -act {act} -m {mul_routed_weight_stage} {preshuffle_str} -w {{}}" + f"{AITER_CSRC_DIR}/ck_gemm_moe_2stages_codegen/gen_instances.py -a {Adtype} -b {Bdtype} -c {Cdtype} -q {quant_type} -act {act} -m {mul_routed_weight_stage} {preshuffle_str} {splitk_str} -w {{}}" ] return md_name, blob_gen_cmd @@ -496,6 +511,8 @@ def ck_moe_stage1_fwd( sorted_weights: Optional[Tensor] = None, quant_type: QuantType = QuantType.No, activation: ActivationType = ActivationType.Silu, + splitk: Optional[int] = 1, + dst_type: Optional[torch.dtype] = None, ): ck_moe_stage1( hidden_states, @@ -513,6 +530,8 @@ def ck_moe_stage1_fwd( sorted_weights, quant_type.value, activation.value, + splitk, + dtype2str_dict[dst_type], ) return out diff --git a/aiter/ops/quant.py b/aiter/ops/quant.py index 0d974fc933..7070a49a42 100644 --- a/aiter/ops/quant.py +++ b/aiter/ops/quant.py @@ -101,6 +101,42 @@ def per_1x32_f4_quant(x, scale=None, quant_dtype=dtypes.fp4x2, shuffle=False): return y, scale.view(dtypes.fp8_e8m0) +def per_1x32_f8_scale_f8_quant( + x, scale=None, 
quant_dtype=dtypes.fp8, scale_type=dtypes.fp32, shuffle=False +): + assert quant_dtype == dtypes.fp8 + block_size = 32 + dtypeMax = 448.0 + MAX_POW2 = int(torch.log2(torch.tensor(dtypeMax, dtype=torch.float32)).item()) + dtypeMax = 2.0**MAX_POW2 + + shape_original = x.shape + x = x.view(-1, shape_original[-1]) + + m, n = x.shape + x = x.view(-1, block_size) + max_abs = torch.amax(torch.abs(x.float()), 1) + + # fp8e8m0fnu_from_fp32_value + if scale_type == dtypes.fp32: + scale_f32 = max_abs / dtypeMax + scale_e8m0_biased = None + else: + scale_e8m0_biased = fp4_utils.f32_to_e8m0(max_abs / dtypeMax) + scale_f32 = fp4_utils.e8m0_to_f32(scale_e8m0_biased) + # scale_f32 = max_abs / dtypeMax + + y = x.float() / scale_f32.view(-1, 1) + y = y.view(*shape_original[:-1], -1) + if scale_type == dtypes.fp32: + scale = scale_f32.view(m, -1) + else: + scale = scale_e8m0_biased.view(m, -1) # .view(torch.uint8) + if shuffle: + scale = fp4_utils.e8m0_shuffle(scale) + return y.to(quant_dtype), scale + + def per_tensor_quant( x, scale=None, scale_dtype=dtypes.fp32, quant_dtype=dtypes.i8, dtypeMax=None ): diff --git a/aiter/ops/triton/fused_mxfp4_quant.py b/aiter/ops/triton/fused_mxfp4_quant.py index 173c3502bf..0218d385d3 100644 --- a/aiter/ops/triton/fused_mxfp4_quant.py +++ b/aiter/ops/triton/fused_mxfp4_quant.py @@ -5,7 +5,6 @@ from typing import Optional from aiter.utility import dtypes from aiter.ops.triton._triton_kernels.fused_mxfp4_quant import ( - _rmsmorm_op, _fused_rms_mxfp4_quant_kernel, _fused_flatten_mxfp4_quant, _fused_reduce_act_mul_and_dynamic_mxfp4_quant_kernel, @@ -650,3 +649,196 @@ def fused_dynamic_mxfp4_quant_moe_sort( x_fp4.view(dtypes.fp4x2), blockscale_e8m0_sorted.view(dtypes.fp8_e8m0).view(-1, N_o), ) + + +@triton.jit +def _fused_quant_fp8_sort_kernel( + # Pointers + input_ptr, + sorted_ids_ptr, + num_valid_ids_ptr, + x_fp8_ptr, + scale_sorted_ptr, + # Input/Output strides + stride_input_m: tl.constexpr, + stride_input_n: tl.constexpr, + stride_x_fp8_m: 
tl.constexpr, + stride_x_fp8_n: tl.constexpr, + stride_scale_o3: tl.constexpr, + stride_scale_o2: tl.constexpr, + stride_scale_o1: tl.constexpr, + stride_scale_o0: tl.constexpr, + # Problem size + M_input: tl.constexpr, + N_input: tl.constexpr, + N_scale_cols: tl.constexpr, + token_num: tl.constexpr, + # Block configuration + BLOCK_SIZE_M: tl.constexpr, + BLOCK_SIZE_N: tl.constexpr, # quant_block_size / 2 + QUANT_BLOCK_SIZE: tl.constexpr, + TOPK: tl.constexpr, + # Quantization parameters + DTYPE_MAX: tl.constexpr, + DTYPE_MIN: tl.constexpr, +): + pid_m = tl.program_id(0) * 2 + pid_n = tl.program_id(1) * 2 + + num_valid_ids = tl.load(num_valid_ids_ptr) + if pid_m * BLOCK_SIZE_M >= num_valid_ids: + return + + out = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.uint32) + + for i in range(4): + m = i % 2 * BLOCK_SIZE_M # 0 or BLOCK_SIZE_M + n = i // 2 * BLOCK_SIZE_N # 0 or BLOCK_SIZE_N + + sorted_ids_offs_m = pid_m * BLOCK_SIZE_M + m + tl.arange(0, BLOCK_SIZE_M) + sorted_ids_mask = sorted_ids_offs_m < num_valid_ids + sorted_ids = tl.load( + sorted_ids_ptr + sorted_ids_offs_m, + mask=sorted_ids_mask, + other=0, + ) + topk_ids = sorted_ids >> 24 + token_ids = sorted_ids & 0xFFFFFF + + if TOPK == 1: + original_m_idx = token_ids + else: + original_m_idx = token_ids * TOPK + topk_ids + + input_offs_n = (pid_n * BLOCK_SIZE_N + n) * QUANT_BLOCK_SIZE + tl.arange( + 0, BLOCK_SIZE_N * QUANT_BLOCK_SIZE + ) + input_offs = ( + original_m_idx[:, None] * stride_input_m + + input_offs_n[None, :] * stride_input_n + ) + input_mask = (original_m_idx < M_input)[:, None] & (input_offs_n < N_input)[ + None, : + ] + + x = tl.load(input_ptr + input_offs, mask=input_mask, other=0.0).to(tl.float32) + + x_reshaped = x.reshape(BLOCK_SIZE_M * BLOCK_SIZE_N, QUANT_BLOCK_SIZE) + + amax = tl.max(tl.abs(x_reshaped), axis=-1, keep_dims=True) + + amax = amax.to(tl.int32, bitcast=True) + amax = (amax + 0x200000).to(tl.uint32, bitcast=True) & 0xFF800000 + amax = amax.to(tl.float32, bitcast=True) + + 
scale_e8m0_unbiased = tl.log2(amax).floor() - tl.log2(DTYPE_MAX).floor() + scale_e8m0_unbiased = tl.clamp(scale_e8m0_unbiased, min=-127, max=127) + + quant_scale = tl.exp2(-scale_e8m0_unbiased) + x_fp8 = tl.clamp(x_reshaped * quant_scale, DTYPE_MIN, DTYPE_MAX) + x_fp8 = x_fp8.reshape(BLOCK_SIZE_M, BLOCK_SIZE_N * QUANT_BLOCK_SIZE) + + scale_e8m0 = (scale_e8m0_unbiased.to(tl.uint8) + 127).to(tl.uint8) + scale_e8m0 = scale_e8m0.reshape(BLOCK_SIZE_M, BLOCK_SIZE_N) # [BLOCK_SIZE_M] + + out_offs_n = (pid_n * BLOCK_SIZE_N + n) * QUANT_BLOCK_SIZE + tl.arange( + 0, BLOCK_SIZE_N * QUANT_BLOCK_SIZE + ) + out_offs = ( + original_m_idx[:, None] * stride_x_fp8_m + + out_offs_n[None, :] * stride_x_fp8_n + ) + out_mask = (original_m_idx < M_input)[:, None] & (out_offs_n < N_input)[None, :] + tl.store( + x_fp8_ptr + out_offs, x_fp8.to(x_fp8_ptr.type.element_ty), mask=out_mask + ) + + out = out | (scale_e8m0.to(tl.uint32) << (i * 8)) + + offs_0 = tl.arange(0, BLOCK_SIZE_M) + offs_1 = tl.arange(0, BLOCK_SIZE_N) + offs_2 = pid_n // 2 + offs_3 = pid_m // 2 + offs = ( + offs_0[:, None] * stride_scale_o0 + + offs_1[None, :] * stride_scale_o1 + + offs_2 * stride_scale_o2 + + offs_3 * stride_scale_o3 + ) + tl.store(scale_sorted_ptr + offs, out) + + +def fused_quant_fp8_sort( + input: torch.Tensor, + sorted_ids: torch.Tensor, + num_valid_ids: torch.Tensor, + token_num: int, + block_size: int = 32, + quant_block_size: int = 8, + quant_dtype: torch.dtype = dtypes.fp8, +) -> tuple[torch.Tensor, torch.Tensor]: + BLOCK_SIZE_M = block_size + BLOCK_SIZE_N = quant_block_size + BLOCK_SIZE_M_u32 = BLOCK_SIZE_M // 2 + BLOCK_SIZE_N_u32 = BLOCK_SIZE_N // 2 + + M, N = input.shape + assert ( + N % quant_block_size == 0 + ), f"N ({N}) must be multiple of quant_block_size ({quant_block_size})" + assert block_size % 32 == 0, "block_size must be multiple of 32" + + N_blocks = triton.cdiv(N, block_size) + + if quant_dtype == dtypes.fp8: + DTYPE_MAX = 448.0 + DTYPE_MIN = -448.0 + elif quant_dtype == 
torch.float8_e4m3fn: + DTYPE_MAX = 448.0 + DTYPE_MIN = -448.0 + else: + DTYPE_MAX = 448.0 + DTYPE_MIN = -448.0 + + x_fp8 = torch.empty_like(input, dtype=quant_dtype, device="cuda") + M_o, N_o = sorted_ids.shape[0], N_blocks + + # [M_sorted_blocks/2, N_blocks/2, BLOCK_SIZE_N_u32, BLOCK_SIZE_M_u32] + scale_e8m0_packed = torch.empty( + ( + triton.cdiv(M_o, BLOCK_SIZE_M), + triton.cdiv(N_o, BLOCK_SIZE_N), + BLOCK_SIZE_N_u32, + BLOCK_SIZE_M_u32, + ), + dtype=torch.uint32, + device=input.device, + ) + + grid = ( + triton.cdiv(M_o, BLOCK_SIZE_M), # 32 + triton.cdiv(N_o, BLOCK_SIZE_N), # 8 + ) + + _fused_quant_fp8_sort_kernel[grid]( + input, + sorted_ids, + num_valid_ids, + x_fp8, + scale_e8m0_packed, + *input.stride(), + *x_fp8.stride(), + *scale_e8m0_packed.stride(), + M_input=M, + N_input=N, + N_scale_cols=N_blocks, + token_num=token_num, + BLOCK_SIZE_M=BLOCK_SIZE_M // 2, + BLOCK_SIZE_N=BLOCK_SIZE_N // 2, + QUANT_BLOCK_SIZE=32, + TOPK=M // token_num, + DTYPE_MAX=DTYPE_MAX, + DTYPE_MIN=DTYPE_MIN, + ) + + return x_fp8, scale_e8m0_packed.view(dtypes.fp8_e8m0).view(-1, N_o) diff --git a/aiter/ops/triton/utils/gemm_config_utils.py b/aiter/ops/triton/utils/gemm_config_utils.py index 23f10dd448..56ed8a20c9 100644 --- a/aiter/ops/triton/utils/gemm_config_utils.py +++ b/aiter/ops/triton/utils/gemm_config_utils.py @@ -17,7 +17,7 @@ Cold start: 290.8928 ms LRU Cache: ENABLED Avg per call: 0.110 us -vs +vs LRU Cache: DISABLED Avg per call: 2.503 us """ diff --git a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py index 469afbab0e..4ca15dfe02 100755 --- a/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py +++ b/csrc/ck_gemm_a4w4_blockscale/gemm_a4w4_blockscale_tune.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: MIT # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. 
-import argparse import os import pandas as pd diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu index a5be94138e..322ea56c71 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.cu @@ -56,13 +56,23 @@ void ck_moe_stage1(torch::Tensor &hidden_states, // [m, k], input token std::optional block_m = 32, std::optional sorted_weights = std::nullopt, int quant_type = 0, - int activation = 0) + int activation = 0, + int splitk = 1, + std::optional dst_type = std::nullopt) { const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(out)); at::hip::getCurrentHIPStream(); - TORCH_CHECK(out.dtype() == at::ScalarType::BFloat16 || out.dtype() == at::ScalarType::Half, - "Out dtype only support BFloat16/Float16!") + if (splitk > 1) + { + TORCH_CHECK(out.dtype() == at::ScalarType::Float, + "Out dtype only support Float when splitk > 1!") + } + else + { + TORCH_CHECK(out.dtype() == at::ScalarType::BFloat16 || out.dtype() == at::ScalarType::Half, + "Out dtype only support BFloat16/Float16!") + } int tokens = hidden_states.size(0); int sorted_size = std::min(int64_t(tokens * topk * block_m.value()), sorted_token_ids.size(0)); @@ -99,7 +109,7 @@ void ck_moe_stage1(torch::Tensor &hidden_states, // [m, k], input token kernel(at::hip::getCurrentHIPStream(), tokens, sorted_size, N, K, topk, - hidden_states_ptr, w1_ptr, w2_ptr, sorted_token_ids_ptr, sorted_expert_ids_ptr, sorted_weights_ptr, num_valid_ids_ptr, out_ptr, w1_scale_ptr, a1_scale_ptr); + hidden_states_ptr, w1_ptr, w2_ptr, sorted_token_ids_ptr, sorted_expert_ids_ptr, sorted_weights_ptr, num_valid_ids_ptr, out_ptr, w1_scale_ptr, a1_scale_ptr, splitk); } void ck_moe_stage2(torch::Tensor &inter_states, // [m, k], input token @@ -116,7 +126,9 @@ void ck_moe_stage2(torch::Tensor &inter_states, // [m, k], input token std::optional block_m = 32, std::optional sorted_weights = 
std::nullopt, int quant_type = 0, - int activation = 0) + int activation = 0, + int splitk = 1, + std::optional dst_type = std::nullopt) { TORCH_CHECK(out.dtype() == at::ScalarType::BFloat16 || out.dtype() == at::ScalarType::Half, "Out dtype only support BFloat16/Float16!") @@ -155,5 +167,5 @@ void ck_moe_stage2(torch::Tensor &inter_states, // [m, k], input token kernel(at::hip::getCurrentHIPStream(), tokens, sorted_size, N, K, topk, - inter_states_ptr, w1_ptr, w2_ptr, sorted_token_ids_ptr, sorted_expert_ids_ptr, sorted_weights_ptr, num_valid_ids_ptr, out_ptr, w2_scale_ptr, a2_scale_ptr); + inter_states_ptr, w1_ptr, w2_ptr, sorted_token_ids_ptr, sorted_expert_ids_ptr, sorted_weights_ptr, num_valid_ids_ptr, out_ptr, w2_scale_ptr, a2_scale_ptr, splitk); } \ No newline at end of file diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.h b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.h index 7c22ab857b..f1ef022159 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.h +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages.h @@ -335,6 +335,19 @@ struct MulABScaleExpertWeightA8W8blkscale } }; +struct MulABScaleExpertWeightA8W8blkscaleSplitk +{ + template + __host__ __device__ constexpr void operator()(E& e, const C& c, const D2& d2) const; + template <> + __host__ __device__ constexpr void + operator()(float& e, const float& c, const float& d2) const + { + (void)d2; + e = ck::type_convert(c); + } +}; + using MoeKernel = std::function, - std::optional)>; + std::optional, + std::optional)>; template w1_scale = std::nullopt, // [e, 1, n], gate(up) scale - std::optional a1_scale = std::nullopt // [m, 1], token scale + std::optional a1_scale = std::nullopt, // [m, 1], token scale + std::optional splitk = 1 // splitk ); template w2_scale = std::nullopt, // [e, 1, n], gate(up) scale - std::optional a2_scale = std::nullopt // [max_num_tokens_padded, 1], token scale + std::optional a2_scale = std::nullopt, // [max_num_tokens_padded, 1], token 
scale + std::optional splitk = 1 // splitk ); diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.cuh index 55cf72b4a9..aa7cd67366 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.cuh +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.cuh @@ -1,40 +1,44 @@ // SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #pragma once -#include "gemm_moe_ck2stages.h" -#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp" +#include "gemm_moe_ck2stages.h" #include -template < - typename A0DataType, - typename B0DataType, - typename AccDataType, - typename EDataType, - typename CDEElementOp, - PipelineVersion PipelineVer, - int BLOCKSIZE, - int MPerBlock, - int NPerBlock, - int KPerBlock, - int MWaves, - int NWaves, - bool Nswizzle, - bool PerTensorQuant, - bool MulRoutedWeight, - int ActOP> -void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, int N, int K, +template +void ck_moe_stage1_gemm(const hipStream_t& stream, + int tokens, + int sorted_size, + int N, + int K, int topk, - void *&hidden_states, // [m, k], input token - void *&w1, // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) - void *&w2, // [expert, dim, inter_dim], pre-shuffle([e, nr, kr, w]) - void *&sorted_token_ids, // [max_num_tokens_padded] - void *&sorted_expert_ids, // [max_num_m_blocks] - void *&sorted_weights, - void *&num_valid_ids, // [1] - void *&out, // [max_num_tokens_padded, inter_dim] - std::optional w1_scale, // [e, 1, n], gate(up) scale - std::optional a1_scale // [m, 1], token scale + void*& hidden_states, // [m, k], input token + void*& w1, // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + void*& w2, // [expert, dim, 
inter_dim], pre-shuffle([e, nr, kr, w]) + void*& sorted_token_ids, // [max_num_tokens_padded] + void*& sorted_expert_ids, // [max_num_m_blocks] + void*& sorted_weights, + void*& num_valid_ids, // [1] + void*& out, // [max_num_tokens_padded, inter_dim] + std::optional w1_scale, // [e, 1, n], gate(up) scale + std::optional a1_scale, // [m, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -42,43 +46,47 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, ck::index_t StrideB = K; ck::index_t StrideD = 0; ck::index_t StrideE = N; - ck::index_t KBatch = 1; + ck::index_t KBatch = 1; // using AccDataType = F32; using CShuffleDataType = F32; - using DsDataType = ck::Tuple; + using DsDataType = ck::Tuple; using A0Layout = Row; using B0Layout = Col; using D0Layout = Row; using D1Layout = Col; - using ELayout = Row; + using ELayout = Row; using D2Layout = ELayout; using DsLayout = ck::Tuple; using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using AElementOp = PassThrough; - using BElementOp = PassThrough; + using AElementOp = PassThrough; + using BElementOp = PassThrough; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; - static constexpr ck::index_t MNPerXDL = 16; - static constexpr ck::index_t WAVES = BLOCKSIZE / 64; + static constexpr ck::index_t MNPerXDL = 16; + static constexpr ck::index_t WAVES = BLOCKSIZE / 64; static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * MWaves); static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * NWaves); - // static constexpr ck::index_t NPerBlock = PipelineVer == ck::BlockGemmPipelineVersion::v1 ? 64 : 128; - static constexpr ck::index_t CShuffleMXDLPerWave = ck::is_same_v ? 2 : MXDLPerWave; - static constexpr ck::index_t CShuffleNXDLPerWave = ck::is_same_v ? 
1 : NXDLPerWave; + // static constexpr ck::index_t NPerBlock = PipelineVer == ck::BlockGemmPipelineVersion::v1 ? 64 + // : 128; + static constexpr ck::index_t CShuffleMXDLPerWave = + ck::is_same_v ? 2 : MXDLPerWave; + static constexpr ck::index_t CShuffleNXDLPerWave = + ck::is_same_v ? 1 : NXDLPerWave; // Note: some fp8 instances didn't compile with AK1/BK1=16 - static constexpr ck::index_t K1 = (NPerBlock == 64 && sizeof(A0DataType) == 1 && sizeof(B0DataType) == 1) ? 8 : 16; + static constexpr ck::index_t K1 = + (NPerBlock == 64 && sizeof(A0DataType) == 1 && sizeof(B0DataType) == 1) ? 8 : 16; static constexpr ck::index_t AK1 = K1 / sizeof(A0DataType); static constexpr ck::index_t BK1 = ck::is_same_v ? 32 : K1 / sizeof(B0DataType); - static constexpr ck::index_t EVec = 16 / sizeof(EDataType); - static constexpr ck::index_t K0_A = KPerBlock / AK1; - static constexpr ck::index_t K0_B = KPerBlock / BK1; + static constexpr ck::index_t EVec = 16 / sizeof(EDataType); + static constexpr ck::index_t K0_A = KPerBlock / AK1; + static constexpr ck::index_t K0_B = KPerBlock / BK1; static constexpr ck::index_t K0_M_A = BLOCKSIZE / K0_A; static constexpr ck::index_t K0_N_B = BLOCKSIZE / K0_B; - static constexpr ck::index_t D0Vec = 1; - static constexpr ck::index_t D1Vec = PerTensorQuant ? 1 : EVec; - static constexpr ck::index_t D2Vec = 1; + static constexpr ck::index_t D0Vec = 1; + static constexpr ck::index_t D1Vec = PerTensorQuant ? 
1 : EVec; + static constexpr ck::index_t D2Vec = 1; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off @@ -88,7 +96,7 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, ///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| ///###### RCR < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, - AElementOp, BElementOp, CDEElementOp, GemmSpec, + AElementOp, BElementOp, CDEElementOp, GemmSpec, BLOCKSIZE, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MNPerXDL, MNPerXDL, @@ -99,45 +107,45 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, ActOP, Nswizzle, true, MulRoutedWeight, !PerTensorQuant, ck::index_t, A0DataType>; // clang-format on - auto a_element_op = AElementOp{}; - auto b_element_op = BElementOp{}; + auto a_element_op = AElementOp{}; + auto b_element_op = BElementOp{}; auto cde_element_op = CDEElementOp{}; constexpr ck::index_t NumDTensor = DsDataType::Size(); - constexpr auto I0 = ck::Number<0>{}; - constexpr auto I1 = ck::Number<1>{}; + constexpr auto I0 = ck::Number<0>{}; + constexpr auto I1 = ck::Number<1>{}; static constexpr auto DStride = PerTensorQuant ? I0 : I1; // do GEMM auto device_op = DeviceOpInstance{}; - auto invoker = device_op.MakeInvoker(); - auto argument = - device_op.MakeArgument(sorted_token_ids, - sorted_expert_ids, - num_valid_ids, - hidden_states, - w1, - std::array{a1_scale.has_value() ? a1_scale.value() : nullptr, - w1_scale.has_value() ? w1_scale.value() : nullptr, - MulRoutedWeight ? 
sorted_weights : nullptr}, - out, - tokens, - topk, - sorted_size, - N, - K, - StrideA, - StrideB, - std::array{DStride, DStride, I0}, - StrideE, - KBatch, - a_element_op, - b_element_op, - cde_element_op); + auto invoker = device_op.MakeInvoker(); + auto argument = device_op.MakeArgument( + sorted_token_ids, + sorted_expert_ids, + num_valid_ids, + hidden_states, + w1, + std::array{a1_scale.has_value() ? a1_scale.value() : nullptr, + w1_scale.has_value() ? w1_scale.value() : nullptr, + MulRoutedWeight ? sorted_weights : nullptr}, + out, + tokens, + topk, + sorted_size, + N, + K, + StrideA, + StrideB, + std::array{DStride, DStride, I0}, + StrideE, + KBatch, + a_element_op, + b_element_op, + cde_element_op); - if (!device_op.IsSupportedArgument(argument)) + if(!device_op.IsSupportedArgument(argument)) { throw std::runtime_error( "wrong! device_gemm with the specified compilation parameters does " @@ -147,51 +155,74 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, invoker.Run(argument, StreamConfig{stream}); } -#define CK_MOE_STAGE1_GEMM_DEFINE(BLOCKSIZE, MPerfBlock, NPerBlock, KPerBlock, MWaves, NWaves, PipelineVer) \ - template void ck_moe_stage1_gemm( \ - const hipStream_t &stream, \ - int tokens, int sorted_size, int N, int K, \ - int topk, \ - void *&hidden_states, \ - void *&w1, \ - void *&w2, \ - void *&sorted_token_ids, \ - void *&sorted_expert_ids, \ - void *&sorted_weights, \ - void *&num_valid_ids, \ - void *&out, \ - std::optional w1_scale, \ - std::optional a1_scale); +#define CK_MOE_STAGE1_GEMM_DEFINE( \ + BLOCKSIZE, MPerfBlock, NPerBlock, KPerBlock, MWaves, NWaves, PipelineVer) \ + template void ck_moe_stage1_gemm(const hipStream_t& stream, \ + int tokens, \ + int sorted_size, \ + int N, \ + int K, \ + int topk, \ + void*& hidden_states, \ + void*& w1, \ + void*& w2, \ + void*& sorted_token_ids, \ + void*& sorted_expert_ids, \ + void*& sorted_weights, \ + void*& num_valid_ids, \ + void*& out, \ + std::optional w1_scale, 
\ + std::optional a1_scale, \ + std::optional splitk); -template < - typename A0DataType, - typename B0DataType, - typename AccDataType, - typename EDataType, - typename CDEElementOp, - PipelineVersion PipelineVer, - int BLOCKSIZE, - int MPerBlock, - int NPerBlock, - int KPerBlock, - int MWaves, - int NWaves, - bool Nswizzle, - bool PerTensorQuant, - bool MulRoutedWeight, - int ActOP = 0> -void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, int N, int K, +template +void ck_moe_stage2_gemm(const hipStream_t& stream, + int tokens, + int sorted_size, + int N, + int K, int topk, - void *&inter_states, // [max_num_tokens_padded, k], input token - void *&w1, // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) - void *&w2, // [expert, dim, inter_dim], pre-shuffle([e, nr, kr, w]) - void *&sorted_token_ids, // [max_num_tokens_padded] - void *&sorted_expert_ids, // [max_num_m_blocks] - void *&sorted_weights, // [max_num_tokens_padded] - void *&num_valid_ids, //[1] - void *&out, // [m, out_dim] - std::optional w2_scale, // [e, 1, n], gate(up) scale - std::optional a2_scale // [max_num_tokens_padded, 1], token scale + void*& inter_states, // [max_num_tokens_padded, k], input token + void*& w1, // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) + void*& w2, // [expert, dim, inter_dim], pre-shuffle([e, nr, kr, w]) + void*& sorted_token_ids, // [max_num_tokens_padded] + void*& sorted_expert_ids, // [max_num_m_blocks] + void*& sorted_weights, // [max_num_tokens_padded] + void*& num_valid_ids, //[1] + void*& out, // [m, out_dim] + std::optional w2_scale, // [e, 1, n], gate(up) scale + std::optional a2_scale, // [max_num_tokens_padded, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -199,45 +230,50 @@ void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, ck::index_t StrideB = K; ck::index_t StrideD = 0; ck::index_t StrideE = N; - ck::index_t KBatch = 1; + 
ck::index_t KBatch = 1; // using AccDataType = F32; using CShuffleDataType = F32; - using DsDataType = ck::Tuple; + using DsDataType = ck::Tuple; using A0Layout = Row; using B0Layout = Col; - using ELayout = Row; + using ELayout = Row; using D0Layout = Row; using D1Layout = Col; using DsLayout = ck::Tuple; using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using AElementOp = PassThrough; - using BElementOp = PassThrough; + using AElementOp = PassThrough; + using BElementOp = PassThrough; static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default; // static constexpr ck::index_t BLOCKSIZE = 256; - static constexpr ck::index_t WAVES = BLOCKSIZE / 64; - static constexpr ck::index_t MNPerXDL = 16; + static constexpr ck::index_t WAVES = BLOCKSIZE / 64; + static constexpr ck::index_t MNPerXDL = 16; static constexpr ck::index_t MXDLPerWave = MPerBlock / (MNPerXDL * MWaves); static constexpr ck::index_t NXDLPerWave = NPerBlock / (MNPerXDL * NWaves); - static constexpr ck::index_t CShuffleMXDLPerWave = ck::is_same_v ? 2 : MXDLPerWave; - static constexpr ck::index_t CShuffleNXDLPerWave = ck::is_same_v ? 2 : NXDLPerWave; - static constexpr ck::index_t CShuffleNLane = ck::is_same_v ? 32 : NPerBlock / 2 / NXDLPerWave; // 64 + static constexpr ck::index_t CShuffleMXDLPerWave = + ck::is_same_v ? 2 : MXDLPerWave; + static constexpr ck::index_t CShuffleNXDLPerWave = + ck::is_same_v ? 2 : NXDLPerWave; + static constexpr ck::index_t CShuffleNLane = + ck::is_same_v ? 32 : NPerBlock / 2 / NXDLPerWave; // 64 static constexpr ck::index_t CShuffleMLane = BLOCKSIZE / CShuffleNLane; // Note: some fp8 instances didn't compile with AK1/BK1=16 - static constexpr ck::index_t K1 = (KPerBlock == 64 && sizeof(A0DataType) == 1 && sizeof(B0DataType) == 1) ? 8 : 16; + static constexpr ck::index_t K1 = + (KPerBlock == 64 && sizeof(A0DataType) == 1 && sizeof(B0DataType) == 1) ? 
8 : 16; static constexpr ck::index_t AK1 = K1 / sizeof(A0DataType); - static constexpr ck::index_t BK1 = ck::is_same_v ? 32 / sizeof(B0DataType) : K1 / sizeof(B0DataType); - static constexpr ck::index_t EVec = 2; + static constexpr ck::index_t BK1 = + ck::is_same_v ? 32 / sizeof(B0DataType) : K1 / sizeof(B0DataType); + static constexpr ck::index_t EVec = 2; static constexpr ck::index_t D0Vec = 1; static constexpr ck::index_t D1Vec = PerTensorQuant ? 1 : EVec; static constexpr ck::index_t D2Vec = 1; - static constexpr ck::index_t K0_A = KPerBlock / AK1; - static constexpr ck::index_t K0_B = KPerBlock / BK1; - static constexpr ck::index_t K0_M = BLOCKSIZE / K0_A; - static constexpr ck::index_t K0_N = BLOCKSIZE / K0_B; + static constexpr ck::index_t K0_A = KPerBlock / AK1; + static constexpr ck::index_t K0_B = KPerBlock / BK1; + static constexpr ck::index_t K0_M = BLOCKSIZE / K0_A; + static constexpr ck::index_t K0_N = BLOCKSIZE / K0_B; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemm // clang-format off @@ -247,7 +283,7 @@ void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, ///#####| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| ///##### RCR < Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, - AElementOp, BElementOp, CDEElementOp, GemmSpec, + AElementOp, BElementOp, CDEElementOp, GemmSpec, BLOCKSIZE, MPerBlock, NPerBlock, KPerBlock, AK1, BK1, MNPerXDL, MNPerXDL, @@ -319,4 +355,5 @@ void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, void *&num_valid_ids, \ void *&out, \ std::optional w2_scale, \ - std::optional a2_scale); \ No newline at end of file + std::optional a2_scale, \ + std::optional splitk); diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py index f849b42d44..20618c331d 100644 --- 
a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common.py @@ -358,6 +358,7 @@ def get_gemm1_kernels_list( ActOP: str, MulRoutedWeight: bool, preshuffle: bool = False, + splitk: bool = False, ) -> list: arch = get_gfx() if Adtype in bit16_list and Bdtype in bit16_list and Adtype == Adtype: @@ -403,7 +404,10 @@ def get_gemm1_kernels_list( if tag == "a8w4": kernel.CDEElementOp = "MulABScaleWint4" elif tag == "a8w8blkscale": - kernel.CDEElementOp = "MulABScaleExpertWeightA8W8blkscale" + if splitk: + kernel.CDEElementOp = "MulABScaleExpertWeightA8W8blkscaleSplitk" + else: + kernel.CDEElementOp = "MulABScaleExpertWeightA8W8blkscale" elif tag == "a8w8" or tag == "a4w4" or tag == "a4w4_bns": kernel.CDEElementOp = "MulABScale" elif tag == "a16w16": diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_blockscale.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_blockscale.cuh index dcd6d096cc..41c918c992 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_blockscale.cuh +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_blockscale.cuh @@ -33,7 +33,9 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, void *&num_valid_ids, // [1] void *&out, // [max_num_tokens_padded, inter_dim] std::optional w1_scale, // [e, 1, n], gate(up) scale - std::optional a1_scale // [m, 1], token scale + std::optional a1_scale, // [m, 1], token scale + std::optional splitk // splitk + ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -45,8 +47,14 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, ck::index_t StrideA = K; ck::index_t StrideB = K; - ck::index_t StrideE = N; - ck::index_t KBatch = 1; + ck::index_t SplitK = splitk.has_value() ? splitk.value() : 1; + + ck::index_t KBatch = SplitK > 1 ? 
K / (SplitK * KPerBlock) : 1; + if (KBatch > 1){ + TORCH_CHECK((KBatch * KPerBlock * SplitK == K), + "K(", K, ") must be a multiple of KPerBlock(", KPerBlock, ") * splitk(", splitk.value(), ").\n"); + } + ck::index_t StrideE = N * (KBatch > 1 ? 2 : 1); using A0Layout = Row; using B0Layout = Col; @@ -83,6 +91,7 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, static constexpr ck::index_t Scale_Block_M = 1; static constexpr ck::index_t Scale_Block_N = 128; static constexpr ck::index_t Scale_Block_K = 128; + static constexpr bool IsSplitK = std::is_same_v; using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmBlockScale // clang-format off < Row, Col, DsLayout, ELayout, @@ -96,7 +105,7 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, S, S<1, 0, 2>, S<1, 0, 2>, 2, AK1, AK1, 0, S, S<1, 0, 2>, S<1, 0, 2>, 2, BK1, BK1, 0, MXDLPerWave, NXDLPerWave, S<1, K0_M_A, 1, K0_A>, S<2, 1, 1, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, ActOP, Nswizzle, true, MulRoutedWeight, int32_t, A0DataType>; + ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, ActOP, Nswizzle, true, IsSplitK, MulRoutedWeight, int32_t, A0DataType>; // clang-format on @@ -157,7 +166,8 @@ void ck_moe_stage1_gemm(const hipStream_t &stream, int tokens, int sorted_size, void *&num_valid_ids, \ void *&out, \ std::optional w1_scale, \ - std::optional a1_scale); + std::optional a1_scale, \ + std::optional splitk); template < typename A0DataType, @@ -187,7 +197,8 @@ void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, void *&num_valid_ids, //[1] void *&out, // [m, out_dim] std::optional w2_scale, // [e, 1, n], gate(up) scale - std::optional a2_scale // [max_num_tokens_padded, 1], token scale + std::optional a2_scale, // [max_num_tokens_padded, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -252,7 +263,7 @@ void 
ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, S, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, S, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0, MXDLPerWave, NXDLPerWave, S<1, K0_M, 1, K0_A>, S<2, 1, 1, 1>, - ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, 0, false, false, MulRoutedWeight, int32_t, A0DataType>; + ck::BlockGemmPipelineScheduler::Intrawave, PipelineVer, 0, false, false, false, MulRoutedWeight, int32_t, A0DataType>; @@ -313,4 +324,5 @@ void ck_moe_stage2_gemm(const hipStream_t &stream, int tokens, int sorted_size, void *&num_valid_ids, \ void *&out, \ std::optional w2_scale, \ - std::optional a2_scale); + std::optional a2_scale, \ + std::optional splitk); diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh index 9321f5950a..f2315b1dd0 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4.cuh @@ -38,7 +38,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream, void*& num_valid_ids, // [1] void*& out, // [max_num_tokens_padded, inter_dim] std::optional w1_scale, // [e, 1, n], gate(up) scale - std::optional a1_scale // [m, 1], token scale + std::optional a1_scale, // [m, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -195,7 +196,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream, void*& num_valid_ids, \ void*& out, \ std::optional w1_scale, \ - std::optional a1_scale); + std::optional a1_scale, \ + std::optional splitk); template w2_scale, // [e, 1, n], gate(up) scale - std::optional a2_scale // [max_num_tokens_padded, 1], token scale + std::optional a2_scale, // [max_num_tokens_padded, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -366,4 +369,5 @@ void ck_moe_stage2_gemm(const hipStream_t& stream, 
void *&num_valid_ids, \ void *&out, \ std::optional w2_scale, \ - std::optional a2_scale); + std::optional a2_scale, \ + std::optional splitk); diff --git a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4_bns.cuh b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4_bns.cuh index 63d7b29d33..ac9d71be29 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4_bns.cuh +++ b/csrc/ck_gemm_moe_2stages_codegen/gemm_moe_ck2stages_common_mxfp4_bns.cuh @@ -37,7 +37,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream, void*& num_valid_ids, // [1] void*& out, // [max_num_tokens_padded, inter_dim] std::optional w1_scale, // [e, 1, n], gate(up) scale - std::optional a1_scale // [m, 1], token scale + std::optional a1_scale, // [m, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -96,10 +97,10 @@ void ck_moe_stage1_gemm(const hipStream_t& stream, ///######| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| ///######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| ///###### RCR - < Row, Col, DsLayout, ELayout, + < Row, Col, DsLayout, ELayout, A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, - AElementOp, BElementOp, CDEElementOp, GemmSpec, - 32, BLOCKSIZE, + AElementOp, BElementOp, CDEElementOp, GemmSpec, + 32, BLOCKSIZE, MPerBlock, NPerBlock, 128, AK1, BK1, MNPerXDL, MNPerXDL, @@ -193,7 +194,8 @@ void ck_moe_stage1_gemm(const hipStream_t& stream, void*& num_valid_ids, \ void*& out, \ std::optional w1_scale, \ - std::optional a1_scale); + std::optional a1_scale, \ + std::optional splitk); template w2_scale, // [e, 1, n], gate(up) scale - std::optional a2_scale 
// [max_num_tokens_padded, 1], token scale + std::optional a2_scale, // [max_num_tokens_padded, 1], token scale + std::optional splitk // splitk ) { // ~~~~~~~~~~~~~~~~~~~~~~~~following start with ck things @@ -285,10 +288,10 @@ void ck_moe_stage2_gemm(const hipStream_t& stream, ///#####| | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| ///#####| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | S| ///##### RCR - < Row, Col, DsLayout, ELayout, + < Row, Col, DsLayout, ELayout, A0DataType, A1DataType, B0DataType, B1DataType, DsDataType, EDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CDEElementOp, GemmSpec, - 32, BLOCKSIZE, + 32, BLOCKSIZE, MPerBlock, NPerBlock, 128, AK1, BK1, MNPerXDL, MNPerXDL, @@ -364,4 +367,5 @@ void ck_moe_stage2_gemm(const hipStream_t& stream, void *&num_valid_ids, \ void *&out, \ std::optional w2_scale, \ - std::optional a2_scale); \ No newline at end of file + std::optional a2_scale, \ + std::optional splitk); diff --git a/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py b/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py index 8062c548d0..38b7826094 100644 --- a/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py +++ b/csrc/ck_gemm_moe_2stages_codegen/gen_instances.py @@ -715,6 +715,7 @@ def __init__( activation, mul_routed_weight_stage, preshuffle, + splitk, ): self.working_path = working_path self.a_dtype = a_dtype.upper() @@ -725,6 +726,7 @@ def __init__( self.mul_routed_weight_stage = mul_routed_weight_stage self.nswizzle = False self.preshuffle = preshuffle + self.splitk = splitk def generate_instance_and_lookUpTable(self): _, gemm1_kernel_list = get_gemm1_kernels_list( @@ -736,6 +738,7 @@ def generate_instance_and_lookUpTable(self): self.activation, 
self.mul_routed_weight_stage == 1, self.preshuffle, + self.splitk, ) tag, gemm2_kernel_list = get_gemm2_kernels_list( self.a_dtype, @@ -770,6 +773,9 @@ def generate_instance_and_lookUpTable(self): quanttype = "_mxfp4" else: quanttype = "" + gemm1_fp32 = ( + self.splitk and (kernel.stage == 1) and (quanttype == "_blockscale") + ) if not os.path.exists(f_instance): with open(f_instance, "a") as f_ins: stage_instance = STG_INSTANCE_IMPL.format( @@ -777,7 +783,7 @@ def generate_instance_and_lookUpTable(self): A0DataType=self.a_dtype, B0DataType=self.b_dtype, AccDataType="F32" if self.a_dtype != "I8" else "I32", - EDataType=self.c_dtype, + EDataType="F32" if gemm1_fp32 else self.c_dtype, CDEElementOp=kernel.CDEElementOp, Nswizzle=str(self.nswizzle).lower(), Quant=self.quant_type, @@ -806,7 +812,7 @@ def generate_instance_and_lookUpTable(self): A0DataType=self.a_dtype, B0DataType=self.b_dtype, AccDataType="F32" if self.a_dtype != "I8" else "I32", - EDataType=self.c_dtype, + EDataType="F32" if gemm1_fp32 else self.c_dtype, CDEElementOp=kernel.CDEElementOp, Nswizzle=str(self.nswizzle).lower(), Quant=self.quant_type, @@ -832,11 +838,12 @@ def generate_instance_and_lookUpTable(self): tag ] with open(f_gemm1_heuristic_dispatch, "a") as f_h: + gemm1_fp32 = self.splitk and (quanttype == "_blockscale") gemm1_heuristic_dispatch_str = gemm1_heuristic_dispatch.format( A0DataType=self.a_dtype, B0DataType=self.b_dtype, AccDataType="F32" if self.a_dtype != "I8" else "I32", - EDataType=self.c_dtype, + EDataType="F32" if gemm1_fp32 else self.c_dtype, CDEElementOp=kernel_list[0].CDEElementOp, Nswizzle=str(self.nswizzle).lower(), Quant=self.quant_type, @@ -949,6 +956,12 @@ def generate_instance_and_lookUpTable(self): help="enable pre-shuffle weight mode", ) + parser.add_argument( + "--issplitk", + action="store_true", + help="enable moe_stage1 splitk mode", + ) + args = parser.parse_args() args.quant_type = ( "per_1x128" if args.quant_type == "per_128x128" else args.quant_type @@ -998,13 
+1011,15 @@ def generate_instance_and_lookUpTable(self): act, routed_weight, preshuffle_mode, + False, # splitk ) codegen.generate_instance_and_lookUpTable() # blk-quant moe blk_quant_l = ["per_1x128"] - for c_dtype, act, routed_weight, quant in itertools.product( - c_dtypes, acts, routed_weight_l, blk_quant_l + blk_splitk_l = [False, True] + for c_dtype, act, routed_weight, quant, splitk in itertools.product( + c_dtypes, acts, routed_weight_l, blk_quant_l, blk_splitk_l ): codegen = ck_moe_2stage_gemm_codegen( args.working_path, @@ -1015,6 +1030,7 @@ def generate_instance_and_lookUpTable(self): act, routed_weight, preshuffle_mode, + splitk, ) codegen.generate_instance_and_lookUpTable() @@ -1039,6 +1055,7 @@ def generate_instance_and_lookUpTable(self): act, routed_weight, preshuffle_mode, + False, # splitk ) codegen.generate_instance_and_lookUpTable() else: @@ -1053,6 +1070,7 @@ def generate_instance_and_lookUpTable(self): args.activation, args.mul_routed_weight_stage, args.preshuffle, + args.issplitk, ) codegen.generate_instance_and_lookUpTable() diff --git a/csrc/ck_tile_gemm_moe_2stages/gen_instances.py b/csrc/ck_tile_gemm_moe_2stages/gen_instances.py index 03d13d1846..bc76068637 100644 --- a/csrc/ck_tile_gemm_moe_2stages/gen_instances.py +++ b/csrc/ck_tile_gemm_moe_2stages/gen_instances.py @@ -12,6 +12,7 @@ get_heuristic_dispatch_template, ) import sys +from chip_info import get_gfx this_dir = os.path.dirname(os.path.abspath(__file__)) AITER_CORE_DIR = os.path.abspath(f"{this_dir}/../../../") @@ -28,7 +29,7 @@ class cktile_moe_2stage_gemm_codegen: def __init__( self, working_path, - ab_dtype, + a_dtypes, acc_dtype, c_dtype, quant_type, @@ -36,11 +37,15 @@ def __init__( mul_routed_weight_stage, istune=False, ): + self.init = True self.working_path = working_path self.impl_path = os.path.join(working_path, "impl") self.instances_path = os.path.join(working_path, "instances") + self.dispatchers_path = os.path.join(working_path, "dispatchers") self.istune = istune - 
self.ab_dtype = ab_dtype.lower() + self.kernel_name_list = [] + self.a_dtypes = a_dtypes + self.b_dtypes = ["fp4"] self.acc_dtype = acc_dtype.lower() self.c_dtype = c_dtype.lower() self.quant_type = quant_type @@ -58,7 +63,7 @@ def get_suffix(self, stage: int) -> str: if element != "" ) - def gen_instance(self, k: kernelInstance): + def gen_instance(self, k: kernelInstance, a_type): INSTANCE_IMPL = f"""// SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "moe_cktile2stages_common.cuh" @@ -115,10 +120,11 @@ def gen_instance(self, k: kernelInstance): xptr = "static_cast(x_scale.value().data_ptr())" wptr = "static_cast(w_scale.value().data_ptr())" elif k.QuantType == "1x32": - scaleGranA = "-1" + # scaleGranA = "-1" + scaleGranA = "1, 32" scaleGranB = "1, 32" biasGran = "1" - xptr = "nullptr" + xptr = "x_scale.has_value() ? static_cast(x_scale.value().data_ptr()) : nullptr" wptr = "static_cast(w_scale.value().data_ptr())" biasptr = "static_cast(exp_bias.value().data_ptr())" @@ -226,8 +232,8 @@ def fill_template(name, a_type, b_type, acc_type, c_type): ) ).write_text(intsance) - if (k.QuantType == "1x32") and (self.ab_dtype in ["bf16", "fp16"]): - fill_template(k.name, self.ab_dtype, "pk_fp4", self.acc_dtype, self.c_dtype) + if (k.QuantType == "1x32") and (a_type in ["bf16", "fp16", "fp8"]): + fill_template(k.name, a_type, "pk_fp4", self.acc_dtype, self.c_dtype) else: for CDtype in ["bf16", "fp16"]: for ABDtype in ["fp8"]: # "bf16", "fp16", @@ -258,8 +264,9 @@ def validate_and_format(template: str, mapping: dict) -> str: # print(placeholders) # print(str_mapping) if missing: - raise KeyError(f"Missing keys in mapping: {missing}") - result = template + for mis in missing: + placeholders.remove(mis) + # result = template # for placeholder in placeholders: # result = result.replace(placeholder, str_mapping[placeholder]) # return result @@ -267,7 +274,9 @@ def validate_and_format(template: str, mapping: 
dict) -> str: # create heuristic heirarchy with open( - os.path.join(self.working_path, "moe_cktile2stages_heuristic_dispatch.h"), + os.path.join( + self.dispatchers_path, f"moe_cktile2stages_heuristic_dispatch_{tag}.h" + ), "w", ) as f: f.write(validate_and_format(HEURISTIC_template, kernels_dict)) @@ -281,6 +290,46 @@ def validate_and_format(template: str, mapping: dict) -> str: # ) # ) + """generate heuristic dispatch header for multi dtype""" + + def gen_heuristic_dispatch_header(self, tags): + HEURISTIC_dispatch_header = """#pragma once +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +#include "moe_cktile2stages.h" + +""" + for tag in tags: + HEURISTIC_headers = f"""#include "./dispatchers/moe_cktile2stages_heuristic_dispatch_{tag}.h" +""" + HEURISTIC_dispatch_header += HEURISTIC_headers + + HEURISTIC_function = """#pragma once +// SPDX-License-Identifier: MIT +// Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. + +#include "moe_cktile2stages.h" + +template +MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m); + +template +MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m); +""" + # create heuristic hierarchy + with open( + os.path.join(self.working_path, "moe_cktile2stages_heuristic_dispatch.h"), + "w", + ) as f: + f.write(HEURISTIC_dispatch_header) + with open( + os.path.join( + self.dispatchers_path, "moe_cktile2stages_heuristic_dispatch_common.h" + ), + "w", + ) as f: + f.write(HEURISTIC_function) + """generate lookup.h linking MNK/datatype to specific instance""" def gen_lookup_dict(self, kernels_dict): @@ -327,7 +376,7 @@ def gen_lookup_dict(self, kernels_dict): """generate manifest.h for instance header""" - def gen_manifest_head(self, kernels_dict): + def gen_manifest_head(self): MAINFEST_head = """#pragma once // SPDX-License-Identifier: MIT // Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
@@ -339,7 +388,8 @@ def gen_manifest_head(self, kernels_dict): #include """ MAINFEST_template = """ -template +// template +template torch::Tensor {kernel_name}( torch::Tensor& XQ, @@ -365,25 +415,32 @@ def gen_manifest_head(self, kernels_dict): os.path.join(self.working_path, "moe_cktile2stages_manifest.h"), "w" ) as f: f.write(MAINFEST_head) - for mnk, k in kernels_dict.items(): - f.write(MAINFEST_template.format(kernel_name=k.name)) + for k_name in self.kernel_name_list: + f.write(MAINFEST_template.format(kernel_name=k_name)) f.write(MAINFEST_end) """generate all instances and headers""" - def gen_instances(self, tag, kernels_dict): - if os.path.exists(self.impl_path): - shutil.rmtree(self.impl_path) - os.mkdir(self.impl_path) - if os.path.exists(self.instances_path): - shutil.rmtree(self.instances_path) - os.mkdir(self.instances_path) + def gen_instances(self, tag, kernels_dict, a_type): + if self.init: + if os.path.exists(self.impl_path): + shutil.rmtree(self.impl_path) + os.mkdir(self.impl_path) + if os.path.exists(self.instances_path): + shutil.rmtree(self.instances_path) + os.mkdir(self.instances_path) + if os.path.exists(self.dispatchers_path): + shutil.rmtree(self.dispatchers_path) + os.mkdir(self.dispatchers_path) + + self.init = False for mnk, k in kernels_dict.items(): - self.gen_instance(k) + self.gen_instance(k, a_type) + if k.name not in self.kernel_name_list: + self.kernel_name_list.append(k.name) self.gen_lookup_dict(kernels_dict) - self.gen_manifest_head(kernels_dict) self.gen_heuristic_dispatch(tag, kernels_dict) @@ -544,7 +601,9 @@ def gen_instances(self, tag, kernels_dict): # b_type = "fp8" # quant_type = "per_token" - a_type = "bf16" + a_types = ["bf16"] + if get_gfx() == "gfx950": + a_types.append("fp8") b_type = "fp4" quant_type = "1x32" @@ -552,27 +611,33 @@ def gen_instances(self, tag, kernels_dict): c_type = "bf16" act_type = "silu" codegen = cktile_moe_2stage_gemm_codegen( - args.working_path, a_type, acc_type, c_type, quant_type, 
act_type, 2, False + args.working_path, a_types, acc_type, c_type, quant_type, act_type, 2, False ) # gen all instances for gemm1 and gemm2 - _, gemm1_kernel_list = get_gemm1_kernels_list( - a_type, - b_type, - quant_type, - act_type, - False, - ) - tag, gemm2_kernel_list = get_gemm2_kernels_list( - a_type, - b_type, - quant_type, - "", - True, - ) - # merge gemm1/gemm2 dict with key = {stage, key} - kernel_dict_merge = { - **{(1, key): value for key, value in gemm1_kernel_list.items()}, - **{(2, key): value for key, value in gemm2_kernel_list.items()}, - } - # print(kernel_dict_merge) - codegen.gen_instances(tag, kernel_dict_merge) + tags = [] + kernel_list = [] + for a_type in a_types: + _, gemm1_kernel_list = get_gemm1_kernels_list( + a_type, + b_type, + quant_type, + act_type, + False, + ) + tag, gemm2_kernel_list = get_gemm2_kernels_list( + a_type, + b_type, + quant_type, + "", + True, + ) + # merge gemm1/gemm2 dict with key = {stage, key} + kernel_dict_merge = { + **{(1, key): value for key, value in gemm1_kernel_list.items()}, + **{(2, key): value for key, value in gemm2_kernel_list.items()}, + } + # print(kernel_dict_merge) + codegen.gen_instances(tag, kernel_dict_merge, a_type) + tags.append(tag) + codegen.gen_heuristic_dispatch_header(tags) + codegen.gen_manifest_head() diff --git a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh index 63f8e39196..673394c298 100644 --- a/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh +++ b/csrc/ck_tile_gemm_moe_2stages/include/moe_cktile2stages_common.cuh @@ -65,8 +65,6 @@ struct MoeFlatmmConfig static constexpr bool TiledMMAPermuteN = false; }; - - template ; // Preshuffle_ - constexpr bool MXFP4_Pipeline = std::is_same_v; + constexpr bool AQUANT_Pipeline = std::is_same_v || + std::is_same_v || + std::is_same_v; + constexpr bool BMXFP4_Pipeline = std::is_same_v; - if constexpr(!MXFP4_Pipeline && moe_kind == 
ck_tile::MoeFlatmmKind::kFFN_gemm1_gate_up) + if constexpr(!BMXFP4_Pipeline && moe_kind == ck_tile::MoeFlatmmKind::kFFN_gemm1_gate_up) { static_assert( FlatmmConfig::N_Tile % (FlatmmConfig::N_Warp * FlatmmConfig::N_Warp_Tile * 2) == 0, @@ -128,11 +129,8 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s) static_assert(sizeof(ComputeDataType) >= sizeof(BDataType), "mixed_prec_flatmm requires ADataType is a wider type than BDataType"); - using GemmPipelineProblem = ck_tile::GemmPipelineProblem; + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; using BaseGemmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1; @@ -142,11 +140,8 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s) const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - const ck_tile::amd_buffer_coherence_enum b_mem_nt_type = - BaseGemmPipeline::GetBMemNTType( - args.NumTokens, - args.N, - args.K); + const int32_t b_mem_nt_type = + static_cast(BaseGemmPipeline::GetBMemNTType(args.NumTokens, args.N, args.K)); float ave_time{0}; @@ -158,10 +153,21 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s) constexpr auto tail_number_v = tail_number_.value; constexpr auto scheduler = FlatmmConfig::Scheduler; constexpr auto memory_operation = memory_operation_.value; - constexpr auto b_mem_nt_type_v = b_mem_nt_type_.value; - - using CodegenPipelineProblem = - std::conditional_t(b_mem_nt_type_.value); + + using CodegenPipelineProblem = std::conditional_t< + BMXFP4_Pipeline, + std::conditional_t, ck_tile::F16xMXF4FlatmmPipelineProblem, - ck_tile::FlatmmPipelineProblem>; + b_mem_nt_type_v>>, + ck_tile::FlatmmPipelineProblem>; constexpr int BlockedXDLN_PerWarp = - (MXFP4_Pipeline || (moe_kind == ck_tile::MoeFlatmmKind::kFFN_gemm1_gate_up)) + (BMXFP4_Pipeline || (moe_kind == ck_tile::MoeFlatmmKind::kFFN_gemm1_gate_up)) ? 
2 : 1; // determined by scale shuffle pattern @@ -211,12 +217,15 @@ void moe_gemm(const MoeFlatmmHostArgs& args, const ck_stream_config& s) BlockedXDLN_PerWarp>>; using CodegenFlatmmPipeline = std::conditional_t< - MXFP4_Pipeline, - ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1, + BMXFP4_Pipeline, + std::conditional_t< + AQUANT_Pipeline, + ck_tile::F8xMXF4FlatmmPipelineAGmemBGmemCRegV1, + ck_tile::F16xMXF4FlatmmPipelineAGmemBGmemCRegV1>, ck_tile::MoeFlatmmPipelineAGmemBGmemCRegV1>; using FusedAct = - std::conditional_t; + std::conditional_t; using Kernel = ck_tile::MoeFlatmmKernel{}); - } - else - { - Run(has_hot_loop_, - tail_number_, - memory_operation_, - ck_tile::integral_constant{}); - } - }; + const auto RunBMem = + [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) { + switch(b_mem_nt_type) + { + case 2: { + Run(has_hot_loop_, + tail_number_, + memory_operation_, + ck_tile::integral_constant{}); + } + break; + default: { + Run(has_hot_loop_, + tail_number_, + memory_operation_, + ck_tile::integral_constant{}); + } + } + }; const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { if(args.k_batch == 1) diff --git a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu index 73674ed146..850676add3 100644 --- a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu +++ b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages.cu @@ -111,6 +111,22 @@ torch::Tensor cktile_moe_gemm1(torch::Tensor& XQ, // moe_dispatch(M, N, K, MPerBlock)(XQ, WQ, Y, sorted_ids, // sorted_expert_ids, max_token_ids, topk, topk_weight, x_scale, w_scale, exp_bias); // } + if (WQ.dtype() == torch_fp4x2 && Y.dtype() == at::ScalarType::BFloat16) + { + moe_dispatch(M, N, K, MPerBlock)(XQ, + WQ, + Y, + sorted_ids, + sorted_expert_ids, + max_token_ids, + topk, + n_padded_zeros, + k_padded_zeros, + topk_weight, + x_scale, + w_scale, + exp_bias); + } } else if((XQ.dtype() == at::ScalarType::BFloat16 
|| XQ.dtype() == at::ScalarType::Half) && (WQ.dtype() == torch_fp4x2)) // a16w4 @@ -184,6 +200,22 @@ torch::Tensor cktile_moe_gemm2(torch::Tensor& XQ, // moe_dispatch(M, N, K, MPerBlock)(XQ, WQ, Y, sorted_ids, // sorted_expert_ids, max_token_ids, topk, topk_weight, x_scale, w_scale, exp_bias); // } + if (WQ.dtype() == torch_fp4x2 && Y.dtype() == at::ScalarType::BFloat16) + { + moe_dispatch(M, N, K, MPerBlock)(XQ, + WQ, + Y, + sorted_ids, + sorted_expert_ids, + max_token_ids, + topk, + n_padded_zeros, + k_padded_zeros, + topk_weight, + x_scale, + w_scale, + exp_bias); + } } else if((XQ.dtype() == at::ScalarType::BFloat16 || XQ.dtype() == at::ScalarType::Half) && (WQ.dtype() == torch_fp4x2)) // a16w4 @@ -215,4 +247,4 @@ torch::Tensor cktile_moe_gemm2(torch::Tensor& XQ, TORCH_CHECK(false, "Unsupported scales/output dtype!"); } return Y; -} \ No newline at end of file +} diff --git a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py index f1be74edd8..189f0331d6 100644 --- a/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py +++ b/csrc/ck_tile_gemm_moe_2stages/moe_cktile2stages_common.py @@ -158,12 +158,36 @@ def name(self) -> str: # 4: kernelInstance( 2, 256, 256, 128, 128, 16, 16, 32, 1, 4,), } +# gemm1 out:bf16/fp16 AB:fp8/fp4 +a8w4_gemm1_kernels_list_gfx950= { + # kernel: stage| BLOCK_SIZE|MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_TILE_K| WAVE_MAP_M| WAVE_MAP_N| BlockPerCU| + # 0: kernelInstance( 1, 256, 16, 128, 256, 16, 16, 128, 1, 4, 2,), + # 5: kernelInstance( 2, 256, 16, 512, 256, 16, 16, 32, 1, 4, 4,), + 1: kernelInstance( 1, 256, 32, 256, 256, 16, 16, 128, 1, 4, 2,), + 3: kernelInstance( 1, 256, 64, 256, 256, 16, 16, 128, 1, 4, 1,), + # 4: kernelInstance( 2, 256, 128, 256, 128, 16, 16, 32, 1, 4, 1,), + # 4: kernelInstance( 2, 256, 256, 256, 256, 16, 16, 32, 1, 4,), + # 4: kernelInstance( 2, 256, 256, 128, 128, 16, 16, 32, 1, 4,), +} +# gemm2 out:bf16/fp16 
AB:fp8/fp4 +a8w4_gemm2_kernels_list_gfx950= { + # kernel: stage| BLOCK_SIZE|MPerBLOCK| NPerBLOCK| KPerBLOCK| WAVE_TILE_M| WAVE_TILE_N| WAVE_TILE_K| WAVE_MAP_M| WAVE_MAP_N| BlockPerCU| + # 0: kernelInstance( 2, 256, 16, 128, 256, 16, 16, 128, 1, 4, 2,), + # 5: kernelInstance( 2, 256, 16, 512, 256, 16, 16, 32, 1, 4, 4,), + 1: kernelInstance( 2, 256, 32, 256, 256, 16, 16, 128, 1, 4, 2,), + 3: kernelInstance( 2, 256, 64, 256, 256, 16, 16, 128, 1, 4, 1,), + # 4: kernelInstance( 2, 256, 128, 256, 128, 16, 16, 32, 1, 4, 1,), + # 4: kernelInstance( 2, 256, 256, 256, 256, 16, 16, 32, 1, 4,), + # 4: kernelInstance( 2, 256, 256, 128, 128, 16, 16, 32, 1, 4,), +} + # fmt: on gemm1_kernels_dict = { "a8w8_gfx950": a8w8_gemm1_kernels_list_gfx950, "a8w8": a8w8_gemm1_kernels_list, "a16w4_gfx950": a16w4_gemm1_kernels_list_gfx950, "a16w4": a16w4_gemm1_kernels_list, + "a8w4_gfx950": a8w4_gemm1_kernels_list_gfx950, } gemm2_kernels_dict = { @@ -171,6 +195,7 @@ def name(self) -> str: "a8w8": a8w8_gemm2_kernels_list, "a16w4_gfx950": a16w4_gemm2_kernels_list_gfx950, "a16w4": a16w4_gemm2_kernels_list, + "a8w4_gfx950": a8w4_gemm2_kernels_list_gfx950, } @@ -178,26 +203,27 @@ def name(self) -> str: // SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "moe_cktile2stages.h" +#include "moe_cktile2stages_heuristic_dispatch_common.h" -template -MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. 
if (block_m == 32) {{ - return {(1, 1)}; + return {(1, 1)}; }} else if (block_m == 64) {{ - return {(1, 2)}; + return {(1, 2)}; }} //else if (block_m == 128) //{{ - // return {(1, 4)}; + // return {(1, 4)}; //}} //else if (block_m == 256) //{{ - // return {(1, 6)}; + // return {(1, 6)}; //}} else {{ @@ -208,25 +234,25 @@ def name(self) -> str: }} }} -template -MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. if (block_m == 32) {{ - return {(2, 0)}; + return {(2, 0)}; }} else if (block_m == 64) {{ - return {(2, 1)}; + return {(2, 1)}; }} //else if (block_m == 128) //{{ - // return {(2, 2)}; + // return {(2, 2)}; //}} //else if (block_m == 256) //{{ - // return {(2, 3)}; + // return {(2, 3)}; //}} else {{ @@ -242,22 +268,23 @@ def name(self) -> str: // SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "moe_cktile2stages.h" +#include "moe_cktile2stages_heuristic_dispatch_common.h" -template -MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. if (block_m == 16) {{ - return {(1, 0)}; + return {(1, 0)}; }} else if (block_m == 32) {{ - return {(1, 1)}; + return {(1, 1)}; }} else if (block_m == 64) {{ - return {(1, 3)}; + return {(1, 3)}; }} else {{ @@ -268,21 +295,21 @@ def name(self) -> str: }} }} -template -MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. 
if (block_m == 16) {{ - return {(2, 0)}; + return {(2, 0)}; }} else if (block_m == 32) {{ - return {(2, 1)}; + return {(2, 1)}; }} else if (block_m == 64) {{ - return {(2, 3)}; + return {(2, 3)}; }} else {{ @@ -298,22 +325,23 @@ def name(self) -> str: // SPDX-License-Identifier: MIT // Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include "moe_cktile2stages.h" +#include "moe_cktile2stages_heuristic_dispatch_common.h" -template -MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm1_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. if (block_m == 16) {{ - return {(1, 0)}; + return {(1, 0)}; }} else if (block_m == 32) {{ - return {(1, 1)}; + return {(1, 1)}; }} else if (block_m == 64) {{ - return {(1, 3)}; + return {(1, 3)}; }} else {{ @@ -324,21 +352,78 @@ def name(self) -> str: }} }} -template -MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) +template <> +MoeKernel moe_gemm2_heuristic_dispatch(int M, int N, int K, int block_m) {{ // Apply shape heuristics to find a suitable kernel implementation. if (block_m == 16) {{ - return {(2, 0)}; + return {(2, 0)}; }} else if (block_m == 32) {{ - return {(2, 1)}; + return {(2, 1)}; + }} + else if (block_m == 64) + {{ + return {(2, 3)}; + }} + else + {{ + TORCH_CHECK( + false, + "Unsupported block_m value for moe_gemm2 heuristic dispatch: ", + block_m); + }} +}} +""" + +a8w4_gfx950_heuristic_dispatch = """#pragma once +// SPDX-License-Identifier: MIT +// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. +#include "moe_cktile2stages.h" +#include "moe_cktile2stages_heuristic_dispatch_common.h" + +template <> +MoeKernel moe_gemm1_heuristic_dispatch( + int M, + int N, + int K, + int block_m) +{{ + // Apply shape heuristics to find a suitable kernel implementation. 
+ if (block_m == 32) + {{ + return {(1, 1)}; }} else if (block_m == 64) {{ - return {(2, 3)}; + return {(1, 3)}; + }} + else + {{ + TORCH_CHECK( + false, + "Unsupported block_m value for moe_geem1 heuristic dispatch: ", + block_m); + }} +}} + +template <> +MoeKernel moe_gemm2_heuristic_dispatch( + int M, + int N, + int K, + int block_m) +{{ + // Apply shape heuristics to find a suitable kernel implementation. + if (block_m == 32) + {{ + return {(2, 1)}; + }} + else if (block_m == 64) + {{ + return {(2, 3)}; }} else {{ @@ -355,6 +440,7 @@ def name(self) -> str: # "a8w8": a8w8_gemm2_kernels_list, "a16w4_gfx950": a16w4_gfx950_heuristic_dispatch, "a16w4": a16w4_heuristic_dispatch, + "a8w4_gfx950": a8w4_gfx950_heuristic_dispatch, } @@ -382,6 +468,13 @@ def get_gemm1_kernels_list( tag = "a16w4_gfx950" else: tag = "a16w4" + elif Adtype.lower() in bit8_list and Bdtype in bit4_list: + if arch == "gfx950": + tag = "a8w4_gfx950" + else: + raise ValueError( + f"Unsupported data type combination: {Adtype}, {Bdtype} on {arch}" + ) else: raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}") kernels_list = gemm1_kernels_dict[tag] @@ -421,6 +514,13 @@ def get_gemm2_kernels_list( tag = "a16w4_gfx950" else: tag = "a16w4" + elif Adtype.lower() in bit8_list and Bdtype in bit4_list: + if arch == "gfx950": + tag = "a8w4_gfx950" + else: + raise ValueError( + f"Unsupported data type combination: {Adtype}, {Bdtype} on {arch}" + ) else: raise ValueError(f"Unsupported data type combination: {Adtype}, {Bdtype}") kernels_list = gemm2_kernels_dict[tag] diff --git a/csrc/cpp_itfs/mha_bwd.cpp b/csrc/cpp_itfs/mha_bwd.cpp index f90b2fb421..2890795854 100644 --- a/csrc/cpp_itfs/mha_bwd.cpp +++ b/csrc/cpp_itfs/mha_bwd.cpp @@ -463,8 +463,9 @@ float fmha_v3_bwd(mha_bwd_args a, const ck_tile::stream_config& s) if(a.mask_type == 3) { - // Note: sink_size=0 is passed as the 3rd parameter (attention sink not supported in bwd yet) - auto sink_size = 0; + // Note: sink_size=0 is passed as 
the 3rd parameter (attention sink not supported in bwd + // yet) + auto sink_size = 0; auto generic_mask = ck_tile::make_generic_attention_mask_coordinates_from_lr_window( a.window_size_left, a.window_size_right, diff --git a/csrc/include/mha_bwd.h b/csrc/include/mha_bwd.h index 99fc32e87e..e552c01be5 100644 --- a/csrc/include/mha_bwd.h +++ b/csrc/include/mha_bwd.h @@ -10,7 +10,8 @@ namespace aiter { -struct mha_bwd_args { +struct mha_bwd_args +{ // aiter args int mask_type; // 0: no mask 1: top_left_causal 2: bottom_right_causal 3: sliding_window bool use_asm_v3; @@ -146,93 +147,100 @@ struct mha_bwd_args { struct __attribute__((packed)) fmha_bwd_dqdkdv_args { - void* ptr_dq; // 0x00: dq or dq_acc + void* ptr_dq; // 0x00: dq or dq_acc p2 _p0; - void* ptr_dk; // 0x10 + void* ptr_dk; // 0x10 p2 _p1; - void* ptr_dv; // 0x20 + void* ptr_dv; // 0x20 p2 _p2; - const void* ptr_q; // 0x30 + const void* ptr_q; // 0x30 p2 _p3; - const void* ptr_k; // 0x40 + const void* ptr_k; // 0x40 p2 _p4; - const void* ptr_v; // 0x50 + const void* ptr_v; // 0x50 p2 _p5; - const void* ptr_do; // 0x60 + const void* ptr_do; // 0x60 p2 _p6; - const void* ptr_lse; // 0x70 + const void* ptr_lse; // 0x70 p2 _p7; - const void* ptr_d; // 0x80 + const void* ptr_d; // 0x80 p2 _p8; - float scalar; // 0x90 + float scalar; // 0x90 p3 _p9; - float log2e; // 0xa0 + float log2e; // 0xa0 p3 _p10; - unsigned int seqlen_q; // 0xb0: s_seq_len_q + unsigned int seqlen_q; // 0xb0: s_seq_len_q p3 _p11; - unsigned int Ts; // 0xc0: s_Seqs_k*sub_K + unsigned int Ts; // 0xc0: s_Seqs_k*sub_K p3 _p12; - unsigned int Hs_q; // 0xd0: s_Hs_q + unsigned int Hs_q; // 0xd0: s_Hs_q p3 _p13; - unsigned int BAs_q; // 0xe0: s_BAs_q + unsigned int BAs_q; // 0xe0: s_BAs_q p3 _p14; - unsigned int Seqs_q; // 0xf0: s_Seqs_q + unsigned int Seqs_q; // 0xf0: s_Seqs_q p3 _p15; - unsigned int ratio; // 0x100 + unsigned int ratio; // 0x100 p3 _p16; - unsigned int Hs_k; // 0x110: s_Hs_k + unsigned int Hs_k; // 0x110: s_Hs_k p3 _p17; - 
unsigned int BAs_k; // 0x120: s_BAs_k + unsigned int BAs_k; // 0x120: s_BAs_k p3 _p18; - unsigned int Seqs_k; // 0x130: s_Seqs_k + unsigned int Seqs_k; // 0x130: s_Seqs_k p3 _p19; - unsigned int Seqs_dk; // 0x140: s_Seqs_dk + unsigned int Seqs_dk; // 0x140: s_Seqs_dk p3 _p20; - unsigned int seqlen_k; // 0x150: batch mode + unsigned int seqlen_k; // 0x150: batch mode p3 _p21; - unsigned int head_dim_q; // 0x160: batch&group mode for headdim padding + unsigned int head_dim_q; // 0x160: batch&group mode for headdim padding p3 _p22; - unsigned int head_dim_v; // 0x170: batch&group mode for headdim padding + unsigned int head_dim_v; // 0x170: batch&group mode for headdim padding p3 _p23; - unsigned int nhead_q; // 0x180: batch mode lsed([B,H,S]) addr = batch_idx * nhead_q * seqlen_q * 4 + head_idx * seqlen_q * 4 + unsigned int nhead_q; // 0x180: batch mode lsed([B,H,S]) addr = batch_idx * nhead_q * seqlen_q * + // 4 + head_idx * seqlen_q * 4 p3 _p24; - unsigned int Hs_v; // 0x190: batch&group mode + unsigned int Hs_v; // 0x190: batch&group mode p3 _p25; - unsigned int BAs_v; // 0x1a0: batch mode + unsigned int BAs_v; // 0x1a0: batch mode p3 _p26; - unsigned int Seqs_v; // 0x1b0: batch&group mode + unsigned int Seqs_v; // 0x1b0: batch&group mode p3 _p27; - unsigned int Hs_do; // 0x1c0: batch&group mode + unsigned int Hs_do; // 0x1c0: batch&group mode p3 _p28; - unsigned int BAs_do; // 0x1d0: group mode + unsigned int BAs_do; // 0x1d0: group mode p3 _p29; - unsigned int Seqs_do; // 0x1e0: batch&group mode + unsigned int Seqs_do; // 0x1e0: batch&group mode p3 _p30; - unsigned int Hs_dk; // 0x1f0: batch&group mode + unsigned int Hs_dk; // 0x1f0: batch&group mode p3 _p31; - unsigned int BAs_dk; // 0x200: group mode + unsigned int BAs_dk; // 0x200: group mode p3 _p32; - unsigned int Hs_dv; // 0x210: batch&group mode + unsigned int Hs_dv; // 0x210: batch&group mode p3 _p33; - unsigned int BAs_dv; // 0x220: group mode + unsigned int BAs_dv; // 0x220: group mode p3 _p34; - 
unsigned int Seqs_dv; // 0x230: batch&group mode + unsigned int Seqs_dv; // 0x230: batch&group mode p3 _p35; - unsigned int Hs_lsed; // 0x240: group mode lsed([H,TotalValid_Q(90)]) addr = seqstart_q[batch_idx] * 4 + head_idx * nhead_stride_lsed(s_Hs_lsed) + unsigned int Hs_lsed; // 0x240: group mode lsed([H,TotalValid_Q(90)]) addr = + // seqstart_q[batch_idx] * 4 + head_idx * nhead_stride_lsed(s_Hs_lsed) p3 _p36; - const void* ptr_qseq; // 0x250: group mode seqstart_q [0, 20, 50, 90] + const void* ptr_qseq; // 0x250: group mode seqstart_q [0, 20, 50, 90] p2 _p37; - const void* ptr_kseq; // 0x260: group mode seqstart_k [0, 50, 110, 180] + const void* ptr_kseq; // 0x260: group mode seqstart_k [0, 50, 110, 180] p2 _p38; - const void* ptr_qseq_padded; // 0x270: group mode seqstart_q_padded [0, 30(20+10), 70(20+10+30+10), 120(20+10+30+10+40+10)] if 10 is padded after each seqlen[30(20+10), 40(30+10), 50(40+10)] + const void* ptr_qseq_padded; // 0x270: group mode seqstart_q_padded [0, 30(20+10), + // 70(20+10+30+10), 120(20+10+30+10+40+10)] if 10 is padded after + // each seqlen[30(20+10), 40(30+10), 50(40+10)] p2 _p39; - const void* ptr_kseq_padded; // 0x280: group mode seqstart_k_padded [0, 60(50+10), 130(50+10+60+10), 200(50+10+60+10+70+10)] if 10 is padded after each seqlen[60(50+10), 70(60+10), 80(70+10)] + const void* ptr_kseq_padded; // 0x280: group mode seqstart_k_padded [0, 60(50+10), + // 130(50+10+60+10), 200(50+10+60+10+70+10)] if 10 is padded after + // each seqlen[60(50+10), 70(60+10), 80(70+10)] p2 _p40; - unsigned int max_seqlen_dq; // 0x290: gorup mode max seqlen q for a16 dq_acc store, padding to 16x + unsigned int + max_seqlen_dq; // 0x290: gorup mode max seqlen q for a16 dq_acc store, padding to 16x p3 _p41; - int mask_x; // 0x2a0 + int mask_x; // 0x2a0 p3 _p42; - int mask_y; // 0x2b0 + int mask_y; // 0x2b0 p3 _p43; }; diff --git a/csrc/include/mha_fwd.h b/csrc/include/mha_fwd.h index 8a43bbca83..65ff3ea1d5 100644 --- a/csrc/include/mha_fwd.h +++ 
b/csrc/include/mha_fwd.h @@ -67,7 +67,7 @@ struct mha_fwd_splitkv_traits : public fmha_fwd_splitkv_traits bias_type, has_lse, false, // do_fp8_static_quant - has_sink} + has_sink} { } }; diff --git a/csrc/include/moe_ck.h b/csrc/include/moe_ck.h index a6d461415b..f9b06dfe9f 100644 --- a/csrc/include/moe_ck.h +++ b/csrc/include/moe_ck.h @@ -18,7 +18,9 @@ void ck_moe_stage1(torch::Tensor& hidden_states, // [m, k], input token std::optional block_m, std::optional sorted_weights, int quant_type, - int activation); + int activation, + int splitk, + std::optional dst_type); void ck_moe_stage2(torch::Tensor& inter_states, // [m, k], input token torch::Tensor& w1, // [e, n, k]/[e, 2*n, k], pre-shuffle([e, nr, kr, w]) @@ -34,4 +36,6 @@ void ck_moe_stage2(torch::Tensor& inter_states, // [m, k], input token std::optional block_m, std::optional sorted_weights, // [max_num_tokens_padded]); int quant_type, - int activation); + int activation, + int splitk, + std::optional dst_type); diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 135e8ae03b..4a973ac1b5 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -914,7 +914,9 @@ namespace py = pybind11; py::arg("block_m") = 32, \ py::arg("sorted_weights") = std::nullopt, \ py::arg("quant_type") = 0, \ - py::arg("activation") = 0); \ + py::arg("activation") = 0, \ + py::arg("splitk") = 1, \ + py::arg("dst_type") = std::nullopt); \ \ m.def("ck_moe_stage2", \ &ck_moe_stage2, \ @@ -932,7 +934,9 @@ namespace py = pybind11; py::arg("block_m") = 32, \ py::arg("sorted_weights") = std::nullopt, \ py::arg("quant_type") = 0, \ - py::arg("activation") = 0); + py::arg("activation") = 0, \ + py::arg("splitk") = 1, \ + py::arg("dst_type") = std::nullopt); #define MOE_CKTILE_2STAGES_PYBIND \ m.def("cktile_moe_gemm1", \ @@ -1537,34 +1541,34 @@ namespace py = pybind11; #define GEMM_COMMON_PYBIND \ m.def("get_padded_m", &getPaddedM, py::arg("M"), py::arg("N"), py::arg("K"), py::arg("gl")); -#define 
TOP_K_PER_ROW_PYBIND \ - m.def("top_k_per_row_prefill", \ - &top_k_per_row_prefill, \ - py::arg("logits"), \ - py::arg("rowStarts"), \ - py::arg("rowEnds"), \ - py::arg("indices"), \ - py::arg("values"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode", \ - &top_k_per_row_decode, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ - py::arg("stride1")); \ - m.def("top_k_per_row_decode_fast", \ - &top_k_per_row_decode_fast, \ - py::arg("logits"), \ - py::arg("next_n"), \ - py::arg("seqLens"), \ - py::arg("indices"), \ - py::arg("numRows"), \ - py::arg("stride0"), \ +#define TOP_K_PER_ROW_PYBIND \ + m.def("top_k_per_row_prefill", \ + &top_k_per_row_prefill, \ + py::arg("logits"), \ + py::arg("rowStarts"), \ + py::arg("rowEnds"), \ + py::arg("indices"), \ + py::arg("values"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode", \ + &top_k_per_row_decode, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ + py::arg("stride1")); \ + m.def("top_k_per_row_decode_fast", \ + &top_k_per_row_decode_fast, \ + py::arg("logits"), \ + py::arg("next_n"), \ + py::arg("seqLens"), \ + py::arg("indices"), \ + py::arg("numRows"), \ + py::arg("stride0"), \ py::arg("stride1")); #define MLA_METADATA_PYBIND \ diff --git a/csrc/include/topk_per_row.h b/csrc/include/topk_per_row.h index 86fcf9bf0c..dcfcfa565e 100644 --- a/csrc/include/topk_per_row.h +++ b/csrc/include/topk_per_row.h @@ -19,7 +19,6 @@ void top_k_per_row_decode(const torch::Tensor& logits, int64_t stride0, int64_t stride1); - void top_k_per_row_decode_fast(const torch::Tensor& logits, int64_t next_n, const torch::Tensor& seqLens, diff --git a/csrc/kernels/mla/metadata/v1_1_device.cuh b/csrc/kernels/mla/metadata/v1_1_device.cuh index 
ef448bd21d..743a99eb79 100644 --- a/csrc/kernels/mla/metadata/v1_1_device.cuh +++ b/csrc/kernels/mla/metadata/v1_1_device.cuh @@ -8,32 +8,30 @@ #define PRINT_DBG 0 -CK_TILE_DEVICE auto get_cost_top( - const int32_t* p_cost_heap, - const int32_t num_clusters) +CK_TILE_DEVICE auto get_cost_top(const int32_t* p_cost_heap, const int32_t num_clusters) { - int32_t cid_min = -1; + int32_t cid_min = -1; int32_t cost_min = 0x7fffffff; // Get local top - for (int32_t cid = ck_tile::get_lane_id(); cid < num_clusters; cid += ck_tile::get_warp_size()) + for(int32_t cid = ck_tile::get_lane_id(); cid < num_clusters; cid += ck_tile::get_warp_size()) { const int32_t cost = p_cost_heap[cid]; - if (cost < cost_min) + if(cost < cost_min) { cost_min = cost; - cid_min = cid; + cid_min = cid; } } - // Get global top - #pragma unroll - for (int32_t offset = (ck_tile::get_warp_size() >> 1); offset > 0; offset >>= 1) +// Get global top +#pragma unroll + for(int32_t offset = (ck_tile::get_warp_size() >> 1); offset > 0; offset >>= 1) { const int32_t srd_lane = (offset ^ ck_tile::get_warp_size()) ^ ck_tile::get_lane_id(); - const int32_t cid_remote = ck_tile::warp_shuffle(cid_min, srd_lane); + const int32_t cid_remote = ck_tile::warp_shuffle(cid_min, srd_lane); const int32_t cost_remote = ck_tile::warp_shuffle(cost_min, srd_lane); - if ((cost_remote < cost_min) || ((cost_remote == cost_min) && (cid_remote < cid_min))) + if((cost_remote < cost_min) || ((cost_remote == cost_min) && (cid_remote < cid_min))) { cost_min = cost_remote; cid_min = cid_remote; @@ -43,23 +41,23 @@ CK_TILE_DEVICE auto get_cost_top( return std::make_tuple(cid_min, cost_min); } -template +template struct MlaMetadataV11Traits { - static constexpr int32_t kPackedQoLenPerWg = kPackedQoLenPerWg_; - static constexpr int32_t kPackedQoLenPerWg_log2 = __builtin_ctz(kPackedQoLenPerWg); - static constexpr int32_t kMaxClusterSize = kMaxClusterSize_; - static constexpr int32_t kSplitTolerance = 16; - static constexpr bool kQoSplits = 
kQoSplits_; + static constexpr int32_t kPackedQoLenPerWg = kPackedQoLenPerWg_; + static constexpr int32_t kPackedQoLenPerWg_log2 = __builtin_ctz(kPackedQoLenPerWg); + static constexpr int32_t kMaxClusterSize = kMaxClusterSize_; + static constexpr int32_t kSplitTolerance = 16; + static constexpr bool kQoSplits = kQoSplits_; // <= -1: read from seqlens_qo_indptr // == 0: read from MlaMetadataV1KernelParameter::uni_seqlen_QO // >= 1: read from MlaMetadataV11Traits::kUniSeqlenQo - static constexpr int32_t kUniSeqlenQo = kUniSeqlenQo_; - static constexpr int32_t kIsSparse = kIsSparse_; + static constexpr int32_t kUniSeqlenQo = kUniSeqlenQo_; + static constexpr int32_t kIsSparse = kIsSparse_; static constexpr bool kSortBatch = true; }; @@ -72,130 +70,145 @@ struct MlaMetadataV11Coefficients }; // This version just follows Flashinfer -CK_TILE_HOST_DEVICE int32_t cal_workload_limit_global_v0( - const int32_t cum_workload, - const int32_t num_clusters, - const int32_t kv_granularity) +CK_TILE_HOST_DEVICE int32_t cal_workload_limit_global_v0(const int32_t cum_workload, + const int32_t num_clusters, + const int32_t kv_granularity) { int32_t limit; - const int32_t avg_workload = ck_tile::max(ck_tile::integer_divide_ceil(cum_workload, num_clusters), 1); - if (avg_workload <= 8) limit = 32; - else if (avg_workload <= 16) limit = 64; - else if (avg_workload <= 32) limit = 128; - else if (avg_workload <= 64) limit = 192; - else limit = avg_workload; + const int32_t avg_workload = + ck_tile::max(ck_tile::integer_divide_ceil(cum_workload, num_clusters), 1); + if(avg_workload <= 8) + limit = 32; + else if(avg_workload <= 16) + limit = 64; + else if(avg_workload <= 32) + limit = 128; + else if(avg_workload <= 64) + limit = 192; + else + limit = avg_workload; return ck_tile::integer_least_multiple(limit, kv_granularity); } -CK_TILE_HOST_DEVICE int32_t cal_workload_limit_global_v1( - const MlaMetadataV11Coefficients& coefs, - const int32_t num_batches, - const int32_t cum_workload, - 
const int32_t num_clusters, - const int32_t packed_seqlen_qo, - const int32_t kv_granularity) +CK_TILE_HOST_DEVICE int32_t cal_workload_limit_global_v1(const MlaMetadataV11Coefficients& coefs, + const int32_t num_batches, + const int32_t cum_workload, + const int32_t num_clusters, + const int32_t packed_seqlen_qo, + const int32_t kv_granularity) { - const int32_t split_overhead = 2 * cal_cost(packed_seqlen_qo, 1) - cal_cost(packed_seqlen_qo, 2); + const int32_t split_overhead = + 2 * cal_cost(packed_seqlen_qo, 1) - cal_cost(packed_seqlen_qo, 2); const int32_t fixed_split_overhead = split_overhead * num_batches; int32_t limit; - const int32_t avg_workload = - ck_tile::max(ck_tile::integer_divide_ceil(cum_workload - fixed_split_overhead, num_clusters), 1); - if (avg_workload <= 8) limit = 32; - else if (avg_workload <= 16) limit = 64; - else if (avg_workload <= 32) limit = 128; - else if (avg_workload <= 64) limit = 192; - else limit = avg_workload; - - const float split_amplifier = - num_batches * coefs.workload_limit_global_0 + - avg_workload * coefs.workload_limit_global_1 + - coefs.workload_limit_global_2; + const int32_t avg_workload = ck_tile::max( + ck_tile::integer_divide_ceil(cum_workload - fixed_split_overhead, num_clusters), 1); + if(avg_workload <= 8) + limit = 32; + else if(avg_workload <= 16) + limit = 64; + else if(avg_workload <= 32) + limit = 128; + else if(avg_workload <= 64) + limit = 192; + else + limit = avg_workload; + + const float split_amplifier = num_batches * coefs.workload_limit_global_0 + + avg_workload * coefs.workload_limit_global_1 + + coefs.workload_limit_global_2; return ck_tile::integer_least_multiple( int32_t(cal_cost(packed_seqlen_qo, limit) + split_overhead * split_amplifier), kv_granularity); } template -CK_TILE_DEVICE void generate_work( - const int32_t batch_idx, - const int32_t tile_idx, - const int32_t qo_len, - const int32_t kv_len, - const int32_t qo_tile_len, - const int32_t packed_qo_tile_len, - const int32_t 
qo_batch_start, - const int32_t kv_batch_start, - const int32_t kv_batch_end, - const int32_t workload_limit_global, - const int32_t num_clusters, - const int32_t kv_granularity, - const int32_t* p_work_indptr, - const int32_t* p_lds_num_qo_clusters_indptr, - int32_t* p_loc_partial_outputs, - int32_t* p_num_partial_outputs, - MlaWorkInfo* p_work_info_set, - MlaPartialTileInfo* p_reduce_final_map, - MlaPartialTileInfo* p_reduce_partial_map, - int32_t* p_cost_heap, - int32_t* p_cluster_work_counter) +CK_TILE_DEVICE void generate_work(const int32_t batch_idx, + const int32_t tile_idx, + const int32_t qo_len, + const int32_t kv_len, + const int32_t qo_tile_len, + const int32_t packed_qo_tile_len, + const int32_t qo_batch_start, + const int32_t kv_batch_start, + const int32_t kv_batch_end, + const int32_t workload_limit_global, + const int32_t num_clusters, + const int32_t kv_granularity, + const int32_t* p_work_indptr, + const int32_t* p_lds_num_qo_clusters_indptr, + int32_t* p_loc_partial_outputs, + int32_t* p_num_partial_outputs, + MlaWorkInfo* p_work_info_set, + MlaPartialTileInfo* p_reduce_final_map, + MlaPartialTileInfo* p_reduce_partial_map, + int32_t* p_cost_heap, + int32_t* p_cluster_work_counter) { int32_t remaining_kv_len = kv_len; - int32_t kv_start_local = 0; + int32_t kv_start_local = 0; - const int32_t kv_len_limit_floor = - ck_tile::integer_least_multiple(ck_tile::integer_divide_ceil(kv_len, num_clusters), kv_granularity); - const auto [cid_top, accum_cost_top] = get_cost_top(p_cost_heap, num_clusters); - const int32_t remaining_capability_top = - ck_tile::max(cal_kv_len(workload_limit_global - accum_cost_top, packed_qo_tile_len), kv_len_limit_floor); + const int32_t kv_len_limit_floor = ck_tile::integer_least_multiple( + ck_tile::integer_divide_ceil(kv_len, num_clusters), kv_granularity); + const auto [cid_top, accum_cost_top] = get_cost_top(p_cost_heap, num_clusters); + const int32_t remaining_capability_top = ck_tile::max( + 
cal_kv_len(workload_limit_global - accum_cost_top, packed_qo_tile_len), kv_len_limit_floor); const int32_t num_splits_estimated = ck_tile::integer_divide_ceil(remaining_kv_len, remaining_capability_top); - // For the case of #splits==2, make sure that the tailing tile is smaller than Traits::kSplitTolerance. - const bool split_kv = (num_splits_estimated == 2) ? - ((remaining_kv_len - remaining_capability_top) > Traits::kSplitTolerance) : - (num_splits_estimated > 1); + // For the case of #splits==2, make sure that the tailing tile is smaller than + // Traits::kSplitTolerance. + const bool split_kv = + (num_splits_estimated == 2) + ? ((remaining_kv_len - remaining_capability_top) > Traits::kSplitTolerance) + : (num_splits_estimated > 1); do { // Check and update cost_heap auto [cid, accum_cost] = get_cost_top(p_cost_heap, num_clusters); - const int32_t remaining_capability = cal_kv_len(workload_limit_global - accum_cost, packed_qo_tile_len); - const int32_t kv_len_limit_local = - [&]() { + const int32_t remaining_capability = + cal_kv_len(workload_limit_global - accum_cost, packed_qo_tile_len); + const int32_t kv_len_limit_local = [&]() { const int32_t limit_ori = ck_tile::max(remaining_capability, kv_len_limit_floor); - const int32_t tail_size = (remaining_kv_len > limit_ori) ? (remaining_kv_len - limit_ori) : 0x7fffffff; - const int32_t limit_fin = (tail_size <= Traits::kSplitTolerance) ? remaining_kv_len : limit_ori; + const int32_t tail_size = + (remaining_kv_len > limit_ori) ? (remaining_kv_len - limit_ori) : 0x7fffffff; + const int32_t limit_fin = + (tail_size <= Traits::kSplitTolerance) ? 
remaining_kv_len : limit_ori; return limit_fin; }(); const int32_t kv_len_consuming = ck_tile::min(remaining_kv_len, kv_len_limit_local); - if (ck_tile::get_lane_id() == 0) + if(ck_tile::get_lane_id() == 0) { - const int32_t cost = cal_cost(packed_qo_tile_len, kv_len_consuming); + const int32_t cost = cal_cost(packed_qo_tile_len, kv_len_consuming); const int32_t new_cost = accum_cost + cost; - p_cost_heap[cid] = new_cost; + p_cost_heap[cid] = new_cost; - if constexpr (kOnlyGatherWorkCount == false) + if constexpr(kOnlyGatherWorkCount == false) { // Record work MlaWorkInfo work_info{}; work_info.batch_idx = batch_idx; work_info.qo_start = tile_idx * qo_tile_len + qo_batch_start; - work_info.qo_end = ck_tile::min(work_info.qo_start + qo_tile_len, qo_batch_start + qo_len); + work_info.qo_end = + ck_tile::min(work_info.qo_start + qo_tile_len, qo_batch_start + qo_len); work_info.kv_start = kv_start_local + kv_batch_start; work_info.kv_end = work_info.kv_start + kv_len_consuming; work_info.kv_offset = kv_batch_end - work_info.kv_end; - if (split_kv) + if(split_kv) { - const int32_t global_cluster_q_idx = p_lds_num_qo_clusters_indptr[batch_idx] + tile_idx; + const int32_t global_cluster_q_idx = + p_lds_num_qo_clusters_indptr[batch_idx] + tile_idx; work_info.partial_qo_loc = *p_loc_partial_outputs; - if (p_reduce_partial_map[global_cluster_q_idx].q_start == -1) + if(p_reduce_partial_map[global_cluster_q_idx].q_start == -1) { p_reduce_partial_map[global_cluster_q_idx].q_start = *p_loc_partial_outputs; - p_reduce_final_map[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }}; + p_reduce_final_map[global_cluster_q_idx] = { + {work_info.qo_start, work_info.qo_end}}; } ++(*p_num_partial_outputs); *p_loc_partial_outputs += (work_info.qo_end - work_info.qo_start); @@ -210,8 +223,14 @@ CK_TILE_DEVICE void generate_work( p_work_info_set[work_info_set_idx] = work_info; #if PRINT_DBG - printf("[metadata] - cost heap updated: work_loc=%d, cid=%d, pre_cost=%d, 
new_cost=%d, tot_cost=%d, kv_len_cons=%d\n", - work_info_set_idx, cid, accum_cost, cost, accum_cost+cost, kv_len_consuming); + printf("[metadata] - cost heap updated: work_loc=%d, cid=%d, pre_cost=%d, " + "new_cost=%d, tot_cost=%d, kv_len_cons=%d\n", + work_info_set_idx, + cid, + accum_cost, + cost, + accum_cost + cost, + kv_len_consuming); #endif } @@ -221,15 +240,13 @@ CK_TILE_DEVICE void generate_work( // Update state remaining_kv_len -= kv_len_consuming; kv_start_local += kv_len_consuming; - } - while (remaining_kv_len > 0); + } while(remaining_kv_len > 0); } template -__launch_bounds__(ck_tile::get_warp_size(), 1) -__global__ void kn_get_mla_metadata_v1_1( - const MlaMetadataV1KernelParameter params, - const MlaMetadataV11Coefficients coefs) +__launch_bounds__(ck_tile::get_warp_size(), 1) __global__ + void kn_get_mla_metadata_v1_1(const MlaMetadataV1KernelParameter params, + const MlaMetadataV11Coefficients coefs) { extern __shared__ uint8_t p_smem[]; @@ -237,27 +254,33 @@ __global__ void kn_get_mla_metadata_v1_1( // Step.0. Get sequence lengths of query/output and key/value for each batch. int32_t* p_lds_batch_idx = reinterpret_cast(p_smem); - int32_t* p_lds_qo_lens = Traits::kSortBatch ? (p_lds_batch_idx + params.num_batches) : p_lds_batch_idx; - int32_t* p_lds_kv_lens = p_lds_qo_lens + params.num_batches; - for (int32_t bid = lane_idx; bid < params.num_batches; bid += ck_tile::get_warp_size()) + int32_t* p_lds_qo_lens = + Traits::kSortBatch ? (p_lds_batch_idx + params.num_batches) : p_lds_batch_idx; + int32_t* p_lds_kv_lens = p_lds_qo_lens + params.num_batches; + for(int32_t bid = lane_idx; bid < params.num_batches; bid += ck_tile::get_warp_size()) { - const int32_t bid_ori = Traits::kIsSparse ? (bid / params.ori_seqlen_qo / params.qk_batch_ratio) - : (bid / params.qk_batch_ratio); - if constexpr (Traits::kSortBatch) + const int32_t bid_ori = Traits::kIsSparse + ? 
(bid / params.ori_seqlen_qo / params.qk_batch_ratio) + : (bid / params.qk_batch_ratio); + if constexpr(Traits::kSortBatch) { p_lds_batch_idx[bid] = bid; } - const int32_t raw_seqlen_kv = params.p_seqlens_kv_indptr[bid_ori + 1] - params.p_seqlens_kv_indptr[bid_ori]; - p_lds_kv_lens[bid] = Traits::kIsSparse ? ck_tile::min(raw_seqlen_kv, params.topk) : raw_seqlen_kv; - p_lds_qo_lens[bid] = params.p_seqlens_qo_indptr[bid_ori + 1] - params.p_seqlens_qo_indptr[bid_ori]; + const int32_t raw_seqlen_kv = + params.p_seqlens_kv_indptr[bid_ori + 1] - params.p_seqlens_kv_indptr[bid_ori]; + p_lds_kv_lens[bid] = + Traits::kIsSparse ? ck_tile::min(raw_seqlen_kv, params.topk) : raw_seqlen_kv; + p_lds_qo_lens[bid] = + params.p_seqlens_qo_indptr[bid_ori + 1] - params.p_seqlens_qo_indptr[bid_ori]; } - QoState qo_state(params.uni_seqlen_qo, params.ori_seqlen_qo, p_lds_qo_lens, params.p_seqlens_qo_indptr); + QoState qo_state( + params.uni_seqlen_qo, params.ori_seqlen_qo, p_lds_qo_lens, params.p_seqlens_qo_indptr); - // Step.1. Calculate the size of cluster and some related information. The size is the number of workgroups + // Step.1. Calculate the size of cluster and some related information. The size is the number of + // workgroups // composing each cluster. The size is determined by average packed qo length. - const int32_t sum_qo_len = warp_sum(p_lds_qo_lens, params.num_batches); - const int32_t cluster_size = - [&]() { + const int32_t sum_qo_len = warp_sum(p_lds_qo_lens, params.num_batches); + const int32_t cluster_size = [&]() { const int32_t avg_qo_len = sum_qo_len / params.num_batches; const int32_t cluster_size = ck_tile::integer_divide_ceil(avg_qo_len, Traits::kPackedQoLenPerWg); @@ -268,94 +291,102 @@ __global__ void kn_get_mla_metadata_v1_1( const int32_t cluster_len_q = cluster_size * Traits::kPackedQoLenPerWg; // Step.2. - // a. Get total valid (after causal masking) kv lengths and the maximun workload handled by each cluster - // b. 
Get a indptr array about #cluster for each batch in direction of qo. + // a. Get total valid (after causal masking) kv lengths and the maximun workload handled by + // each cluster b. Get a indptr array about #cluster for each batch in direction of qo. int32_t* p_lds_num_qo_clusters_indptr = p_lds_kv_lens + params.num_batches; - if (lane_idx == 0) + if(lane_idx == 0) { p_lds_num_qo_clusters_indptr[0] = 0; } - int32_t scan_base = 0; - int32_t workload_sum = 0; - const int32_t num_loop_batch = - integer_divide_ceil_power2(params.num_batches, - ck_tile::get_warp_size(), - __builtin_ctz(ck_tile::get_warp_size())); + int32_t scan_base = 0; + int32_t workload_sum = 0; + const int32_t num_loop_batch = integer_divide_ceil_power2( + params.num_batches, ck_tile::get_warp_size(), __builtin_ctz(ck_tile::get_warp_size())); // lds pointed by p_lds_qo_tiles will be reused by p_lds_sort_workspace later - int32_t* p_lds_qo_tiles = p_lds_num_qo_clusters_indptr + params.num_batches + 1; - for (int32_t loop_idx = 0; loop_idx < num_loop_batch; ++loop_idx) + int32_t* p_lds_qo_tiles = p_lds_num_qo_clusters_indptr + params.num_batches + 1; + for(int32_t loop_idx = 0; loop_idx < num_loop_batch; ++loop_idx) { - const int32_t bid = lane_idx + loop_idx * ck_tile::get_warp_size(); + const int32_t bid = lane_idx + loop_idx * ck_tile::get_warp_size(); int32_t num_qo_tiles = 0; - int32_t workload = 0; + int32_t workload = 0; - if (bid < params.num_batches) + if(bid < params.num_batches) { - const int32_t kv_len = p_lds_kv_lens[bid]; - const int32_t qo_len = qo_state.get_seqlen(bid); + const int32_t kv_len = p_lds_kv_lens[bid]; + const int32_t qo_len = qo_state.get_seqlen(bid); const int32_t packed_qo_len = qo_len * params.num_heads; - num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); + num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); p_lds_qo_tiles[bid] = num_qo_tiles; const int32_t packed_qo_tile_len = ck_tile::min(packed_qo_len, cluster_len_q); 
- for (int32_t tid = 0; tid < num_qo_tiles; ++tid) + for(int32_t tid = 0; tid < num_qo_tiles; ++tid) { - const int32_t kv_len_valid = - cal_packed_causal_kv_len( - qo_len, kv_len, tid, packed_qo_tile_len, num_qo_tiles, params.num_heads, params.is_causal); + const int32_t kv_len_valid = cal_packed_causal_kv_len(qo_len, + kv_len, + tid, + packed_qo_tile_len, + num_qo_tiles, + params.num_heads, + params.is_causal); workload += cal_cost(packed_qo_tile_len, kv_len_valid); } } const int32_t prefix_sum_qo_tiles = warp_prefix_sum(num_qo_tiles, ck_tile::get_warp_size()); const int32_t global_sum_qo_tiles = prefix_sum_qo_tiles + scan_base; - if (bid < params.num_batches) + if(bid < params.num_batches) { p_lds_num_qo_clusters_indptr[bid + 1] = global_sum_qo_tiles; } scan_base = ck_tile::warp_shuffle(global_sum_qo_tiles, ck_tile::get_warp_size() - 1); - workload_sum += aiter::warpReduce(workload); + workload_sum += + aiter::warpReduce( + workload); } const int32_t num_qo_tiles = scan_base; const int32_t tot_qo_tiles = warp_sum(p_lds_qo_tiles, params.num_batches); const int32_t workload_limit_global = - cal_workload_limit_global_v1( - coefs, - params.num_batches, - workload_sum, - num_clusters, - qo_state.is_unique() ? qo_state.get_seqlen(0) : cluster_len_q, - params.kv_granularity); + cal_workload_limit_global_v1(coefs, + params.num_batches, + workload_sum, + num_clusters, + qo_state.is_unique() ? qo_state.get_seqlen(0) : cluster_len_q, + params.kv_granularity); #if PRINT_DBG - if (lane_idx == 0) + if(lane_idx == 0) { printf("[metadata] workload_limit_global=%d\n", workload_limit_global); } #endif // Step.3. Sort batch idx based on cost. High cost batch first. - if constexpr (Traits::kSortBatch) + if constexpr(Traits::kSortBatch) { - int32_t *p_lds_sort_workspace = p_lds_num_qo_clusters_indptr + params.num_batches + 1; // will be reused later. 
- warp_sort(p_lds_batch_idx, p_lds_sort_workspace, p_lds_qo_lens, p_lds_kv_lens, params.num_batches); + int32_t* p_lds_sort_workspace = + p_lds_num_qo_clusters_indptr + params.num_batches + 1; // will be reused later. + warp_sort(p_lds_batch_idx, + p_lds_sort_workspace, + p_lds_qo_lens, + p_lds_kv_lens, + params.num_batches); } // Step.4.1. Initialize lds - int32_t* p_cost_heap = p_lds_qo_tiles; + int32_t* p_cost_heap = p_lds_qo_tiles; int32_t* p_cluster_work_counter = p_cost_heap + num_clusters + 1; - for (int32_t cid = lane_idx; cid < num_clusters; cid += ck_tile::get_warp_size()) + for(int32_t cid = lane_idx; cid < num_clusters; cid += ck_tile::get_warp_size()) { - p_cost_heap[cid] = 0; + p_cost_heap[cid] = 0; p_cluster_work_counter[cid] = 0; } // Step.5. Fill the output buffers except indptrs auto get_kv_batch_start = [&](const int32_t bid) { const int32_t bid_ori = bid / params.qk_batch_ratio; - if constexpr (Traits::kIsSparse) + if constexpr(Traits::kIsSparse) { return bid_ori * params.topk; } @@ -366,55 +397,77 @@ __global__ void kn_get_mla_metadata_v1_1( }; // Step.5.1. Get total work for each cluster - for (int32_t idx = 0; idx < params.num_batches; ++idx) + for(int32_t idx = 0; idx < params.num_batches; ++idx) { - const int32_t bid = Traits::kSortBatch ? p_lds_batch_idx[idx] : idx; - const int32_t bid_ori = bid / params.qk_batch_ratio; - const int32_t qo_len = qo_state.get_seqlen(bid); - const int32_t qo_batch_start = qo_state.get_begin(bid); - const int32_t kv_len = p_lds_kv_lens[bid]; - const int32_t kv_batch_start = Traits::kIsSparse ? bid_ori * params.topk - : params.p_seqlens_kv_indptr[bid_ori]; - const int32_t kv_batch_end = kv_batch_start + kv_len; - const int32_t packed_qo_len = qo_len * params.num_heads; - const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); + const int32_t bid = Traits::kSortBatch ? 
p_lds_batch_idx[idx] : idx; + const int32_t bid_ori = bid / params.qk_batch_ratio; + const int32_t qo_len = qo_state.get_seqlen(bid); + const int32_t qo_batch_start = qo_state.get_begin(bid); + const int32_t kv_len = p_lds_kv_lens[bid]; + const int32_t kv_batch_start = + Traits::kIsSparse ? bid_ori * params.topk : params.p_seqlens_kv_indptr[bid_ori]; + const int32_t kv_batch_end = kv_batch_start + kv_len; + const int32_t packed_qo_len = qo_len * params.num_heads; + const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); const int32_t packed_qo_tile_len = ck_tile::min(packed_qo_len, cluster_len_q); - const int32_t qo_tile_len = ck_tile::integer_divide_ceil(packed_qo_tile_len, params.num_heads); + const int32_t qo_tile_len = + ck_tile::integer_divide_ceil(packed_qo_tile_len, params.num_heads); - for (int32_t tid = 0; tid < num_qo_tiles; ++tid) + for(int32_t tid = 0; tid < num_qo_tiles; ++tid) { - const int32_t tile_kv_len = - cal_packed_causal_kv_len( - qo_len, kv_len, tid, packed_qo_tile_len, num_qo_tiles, params.num_heads, params.is_causal); - - generate_work( - bid, tid, qo_len, tile_kv_len, qo_tile_len, packed_qo_tile_len, qo_batch_start, kv_batch_start, - kv_batch_end, workload_limit_global, num_clusters, params.kv_granularity, nullptr, - p_lds_num_qo_clusters_indptr, nullptr, nullptr, nullptr, nullptr, nullptr, p_cost_heap, - p_cluster_work_counter); + const int32_t tile_kv_len = cal_packed_causal_kv_len(qo_len, + kv_len, + tid, + packed_qo_tile_len, + num_qo_tiles, + params.num_heads, + params.is_causal); + + generate_work(bid, + tid, + qo_len, + tile_kv_len, + qo_tile_len, + packed_qo_tile_len, + qo_batch_start, + kv_batch_start, + kv_batch_end, + workload_limit_global, + num_clusters, + params.kv_granularity, + nullptr, + p_lds_num_qo_clusters_indptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + p_cost_heap, + p_cluster_work_counter); } } // Step.5.2. 
Re-init cost heap and cumulative sum cluster_work_tot - scan_base = 0; - const int32_t num_loop_clusters = - integer_divide_ceil_power2(num_clusters, ck_tile::get_warp_size(), __builtin_ctz(ck_tile::get_warp_size())); - for (int32_t loop_idx = 0; loop_idx < num_loop_clusters; ++loop_idx) + scan_base = 0; + const int32_t num_loop_clusters = integer_divide_ceil_power2( + num_clusters, ck_tile::get_warp_size(), __builtin_ctz(ck_tile::get_warp_size())); + for(int32_t loop_idx = 0; loop_idx < num_loop_clusters; ++loop_idx) { const int32_t cid = lane_idx + loop_idx * ck_tile::get_warp_size(); const int32_t cluster_work = (cid < num_clusters) ? p_cluster_work_counter[cid] : 0; - const int32_t cum_cluster_work = warp_prefix_sum(cluster_work, ck_tile::get_warp_size()) + scan_base; + const int32_t cum_cluster_work = + warp_prefix_sum(cluster_work, ck_tile::get_warp_size()) + scan_base; scan_base = ck_tile::warp_shuffle(cum_cluster_work, ck_tile::get_warp_size() - 1); - if (cid < num_clusters) + if(cid < num_clusters) { params.p_work_indptr[cid + 1] = cum_cluster_work; - p_cost_heap[cid] = 0; - p_cluster_work_counter[cid] = 0; + p_cost_heap[cid] = 0; + p_cluster_work_counter[cid] = 0; } } - if (lane_idx == 0) + if(lane_idx == 0) { params.p_work_indptr[0] = 0; } @@ -422,57 +475,79 @@ __global__ void kn_get_mla_metadata_v1_1( MlaPartialTileInfo* p_reduce_partial_map = reinterpret_cast(p_cluster_work_counter + num_clusters); MlaPartialTileInfo* p_reduce_final_map = p_reduce_partial_map + tot_qo_tiles; - for (int32_t cluster_q_idx = threadIdx.x; cluster_q_idx < tot_qo_tiles; cluster_q_idx += ck_tile::get_warp_size()) + for(int32_t cluster_q_idx = threadIdx.x; cluster_q_idx < tot_qo_tiles; + cluster_q_idx += ck_tile::get_warp_size()) { p_reduce_partial_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}}; - p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}}; + p_reduce_final_map[cluster_q_idx] = MlaPartialTileInfo{{-1, -2}}; } // Step.5.3. 
Output work info - int32_t num_partial_outputs = 0; - int32_t loc_partial_outputs = 0; + int32_t num_partial_outputs = 0; + int32_t loc_partial_outputs = 0; MlaWorkInfo* p_work_info_set = reinterpret_cast(params.p_work_info_set_raw); - for (int32_t idx = 0; idx < params.num_batches; ++idx) + for(int32_t idx = 0; idx < params.num_batches; ++idx) { - const int32_t bid = Traits::kSortBatch ? p_lds_batch_idx[idx] : idx; - const int32_t bid_ori = bid / params.qk_batch_ratio; - const int32_t qo_len = qo_state.get_seqlen(bid); - const int32_t qo_batch_start = qo_state.get_begin(bid); - const int32_t kv_len = p_lds_kv_lens[bid]; - const int32_t kv_batch_start = Traits::kIsSparse ? bid_ori * params.topk - : params.p_seqlens_kv_indptr[bid_ori]; - const int32_t kv_batch_end = kv_batch_start + kv_len; - const int32_t packed_qo_len = qo_len * params.num_heads; - const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); + const int32_t bid = Traits::kSortBatch ? p_lds_batch_idx[idx] : idx; + const int32_t bid_ori = bid / params.qk_batch_ratio; + const int32_t qo_len = qo_state.get_seqlen(bid); + const int32_t qo_batch_start = qo_state.get_begin(bid); + const int32_t kv_len = p_lds_kv_lens[bid]; + const int32_t kv_batch_start = + Traits::kIsSparse ? 
bid_ori * params.topk : params.p_seqlens_kv_indptr[bid_ori]; + const int32_t kv_batch_end = kv_batch_start + kv_len; + const int32_t packed_qo_len = qo_len * params.num_heads; + const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); const int32_t packed_qo_tile_len = ck_tile::min(packed_qo_len, cluster_len_q); - const int32_t qo_tile_len = ck_tile::integer_divide_ceil(packed_qo_tile_len, params.num_heads); + const int32_t qo_tile_len = + ck_tile::integer_divide_ceil(packed_qo_tile_len, params.num_heads); #if PRINT_DBG - if (lane_idx == 0) + if(lane_idx == 0) { printf("[metadata] Dividing batch=%d, qo_len=%d, kv_len=%d\n", bid, qo_len, kv_len); } #endif - for (int32_t tid = 0; tid < num_qo_tiles; ++tid) + for(int32_t tid = 0; tid < num_qo_tiles; ++tid) { - const int32_t tile_kv_len = - cal_packed_causal_kv_len( - qo_len, kv_len, tid, packed_qo_tile_len, num_qo_tiles, params.num_heads, params.is_causal); - - generate_work( - bid, tid, qo_len, tile_kv_len, qo_tile_len, packed_qo_tile_len, qo_batch_start, kv_batch_start, - kv_batch_end, workload_limit_global, num_clusters, params.kv_granularity, params.p_work_indptr, - p_lds_num_qo_clusters_indptr, &loc_partial_outputs, &num_partial_outputs, p_work_info_set, - p_reduce_final_map, p_reduce_partial_map, p_cost_heap, p_cluster_work_counter); + const int32_t tile_kv_len = cal_packed_causal_kv_len(qo_len, + kv_len, + tid, + packed_qo_tile_len, + num_qo_tiles, + params.num_heads, + params.is_causal); + + generate_work(bid, + tid, + qo_len, + tile_kv_len, + qo_tile_len, + packed_qo_tile_len, + qo_batch_start, + kv_batch_start, + kv_batch_end, + workload_limit_global, + num_clusters, + params.kv_granularity, + params.p_work_indptr, + p_lds_num_qo_clusters_indptr, + &loc_partial_outputs, + &num_partial_outputs, + p_work_info_set, + p_reduce_final_map, + p_reduce_partial_map, + p_cost_heap, + p_cluster_work_counter); } } // Step.6. 
Output metadata for reduce kernel - scan_base = 0; - const int32_t num_loop_reduce = - integer_divide_ceil_power2(tot_qo_tiles, ck_tile::get_warp_size(), __builtin_ctz(ck_tile::get_warp_size())); - for (int32_t loop_idx = 0; loop_idx < num_loop_reduce; ++loop_idx) + scan_base = 0; + const int32_t num_loop_reduce = integer_divide_ceil_power2( + tot_qo_tiles, ck_tile::get_warp_size(), __builtin_ctz(ck_tile::get_warp_size())); + for(int32_t loop_idx = 0; loop_idx < num_loop_reduce; ++loop_idx) { const int32_t global_cluster_q_idx = lane_idx + loop_idx * ck_tile::get_warp_size(); @@ -481,53 +556,61 @@ __global__ void kn_get_mla_metadata_v1_1( int32_t reduce_tile_size; int32_t num_reduce_tiles = 0; - if (global_cluster_q_idx < tot_qo_tiles) + if(global_cluster_q_idx < tot_qo_tiles) { - final_info = p_reduce_final_map[global_cluster_q_idx]; + final_info = p_reduce_final_map[global_cluster_q_idx]; partial_range = p_reduce_partial_map[global_cluster_q_idx]; - reduce_tile_size = (final_info.q_start == -1) ? 0 : (final_info.q_end - final_info.q_start); + reduce_tile_size = + (final_info.q_start == -1) ? 0 : (final_info.q_end - final_info.q_start); num_reduce_tiles = - (reduce_tile_size == 0) ? 0 : ((partial_range.q_end - partial_range.q_start) / reduce_tile_size); + (reduce_tile_size == 0) + ? 
0 + : ((partial_range.q_end - partial_range.q_start) / reduce_tile_size); } - const int32_t curr_cum_reduce_tiles = warp_prefix_sum(num_reduce_tiles, ck_tile::get_warp_size()) + scan_base; + const int32_t curr_cum_reduce_tiles = + warp_prefix_sum(num_reduce_tiles, ck_tile::get_warp_size()) + scan_base; const int32_t prev_cum_reduce_tiles = curr_cum_reduce_tiles - num_reduce_tiles; scan_base = ck_tile::warp_shuffle(curr_cum_reduce_tiles, ck_tile::get_warp_size() - 1); - if (global_cluster_q_idx < tot_qo_tiles) + if(global_cluster_q_idx < tot_qo_tiles) { - for (int32_t tid = prev_cum_reduce_tiles; tid < curr_cum_reduce_tiles; ++tid) + for(int32_t tid = prev_cum_reduce_tiles; tid < curr_cum_reduce_tiles; ++tid) { const int32_t local_tid = tid - prev_cum_reduce_tiles; - params.p_reduce_partial_map[tid] = partial_range.q_start + local_tid * reduce_tile_size; + params.p_reduce_partial_map[tid] = + partial_range.q_start + local_tid * reduce_tile_size; } - params.p_reduce_indptr[global_cluster_q_idx + 1] = curr_cum_reduce_tiles; - params.p_reduce_final_map[2 * global_cluster_q_idx] = final_info.q_start; + params.p_reduce_indptr[global_cluster_q_idx + 1] = curr_cum_reduce_tiles; + params.p_reduce_final_map[2 * global_cluster_q_idx] = final_info.q_start; params.p_reduce_final_map[2 * global_cluster_q_idx + 1] = final_info.q_end; } } // reduce_indptr may be larger than #clusters. const int32_t num_reduce_tiles = scan_base; - for (int32_t idx = tot_qo_tiles + 1 + lane_idx; idx < params.reduce_indptr_size; idx += ck_tile::get_warp_size()) + for(int32_t idx = tot_qo_tiles + 1 + lane_idx; idx < params.reduce_indptr_size; + idx += ck_tile::get_warp_size()) { params.p_reduce_indptr[idx] = num_reduce_tiles; } // Step.7. Fill metadata pointers for MLA kernel and the 1st element of reduce_indptr. 
- if (lane_idx == 0) + if(lane_idx == 0) { params.p_reduce_indptr[0] = 0; - params.p_work_metadata_ptrs[0] = static_cast(reinterpret_cast(params.p_work_indptr)); - params.p_work_metadata_ptrs[1] = static_cast(reinterpret_cast(params.p_work_info_set_raw)); + params.p_work_metadata_ptrs[0] = + static_cast(reinterpret_cast(params.p_work_indptr)); + params.p_work_metadata_ptrs[1] = + static_cast(reinterpret_cast(params.p_work_info_set_raw)); } #if PRINT_DBG - if (lane_idx == 0) + if(lane_idx == 0) { printf("[metadata] Final Cost Heap Status:\n"); - for (int32_t cid = 0; cid < num_clusters; ++cid) + for(int32_t cid = 0; cid < num_clusters; ++cid) { printf("[metadata] - cid=%d, cost=%d\n", cid, p_cost_heap[cid]); } @@ -535,40 +618,46 @@ __global__ void kn_get_mla_metadata_v1_1( #endif } -template -void dispatch_mla_metadata_v1_1_device( - const MlaMetadataV1KernelParameter& params, - const MlaMetadataV11Coefficients& coefs, - const hipStream_t stream, - const int32_t warp_size, - const int32_t lds_size) +template +void dispatch_mla_metadata_v1_1_device(const MlaMetadataV1KernelParameter& params, + const MlaMetadataV11Coefficients& coefs, + const hipStream_t stream, + const int32_t warp_size, + const int32_t lds_size) { - using Traits = MlaMetadataV11Traits; + using Traits = MlaMetadataV11Traits; const dim3 grid = dim3(1, 1, 1); kn_get_mla_metadata_v1_1<<>>(params, coefs); } -void get_mla_metadata_v1_1_device( - const torch::Tensor& seqlens_qo_indptr, // [batch size + 1] - const torch::Tensor& seqlens_kv_indptr, // [batch size + 1] - const int32_t num_heads_per_head_k, - const int32_t num_heads_k, - const bool is_causal, - const bool no_redundant, - const int32_t kv_granularity, - const int32_t max_seqlen_qo, - const int32_t ori_uni_seqlen_qo, - const int32_t topk, - torch::Tensor& work_metadata_ptrs, - torch::Tensor& work_info_set, - torch::Tensor& work_indptr, - torch::Tensor& reduce_indptr, - torch::Tensor& reduce_final_map, - torch::Tensor& reduce_partial_map) +void 
get_mla_metadata_v1_1_device(const torch::Tensor& seqlens_qo_indptr, // [batch size + 1] + const torch::Tensor& seqlens_kv_indptr, // [batch size + 1] + const int32_t num_heads_per_head_k, + const int32_t num_heads_k, + const bool is_causal, + const bool no_redundant, + const int32_t kv_granularity, + const int32_t max_seqlen_qo, + const int32_t ori_uni_seqlen_qo, + const int32_t topk, + torch::Tensor& work_metadata_ptrs, + torch::Tensor& work_info_set, + torch::Tensor& work_indptr, + torch::Tensor& reduce_indptr, + torch::Tensor& reduce_final_map, + torch::Tensor& reduce_partial_map) { - // This default settings is for our ASM MLA decode kernel. This kernel supports num_heads=16 and qo size from 1 - // to 4 without support to split qo for each workgroup. This means that kPackedQoLenPerWg should be 4*16=64 to - // prevent spliting in any case supported by it. + // This default settings is for our ASM MLA decode kernel. This kernel supports num_heads=16 and + // qo size from 1 to 4 without support to split qo for each workgroup. This means that + // kPackedQoLenPerWg should be 4*16=64 to prevent spliting in any case supported by it. constexpr int32_t kPackedQoLenPerWg = 128; constexpr int32_t kMaxClusterSize = 1; @@ -587,29 +676,32 @@ void get_mla_metadata_v1_1_device( int32_t qk_batch_ratio = 1; int32_t uni_seqlen_qo = ori_uni_seqlen_qo; - // In the following cases, we use #head=16 to simulate cases which is not natively supported by mla main kernel. - if ((num_heads != 16) && (num_heads != 128) && // main kernel natively supports #head=16 or #head=128 - (num_heads % 16 == 0) && (num_heads < 128)) + // In the following cases, we use #head=16 to simulate cases which is not natively supported by + // mla main kernel. 
+ if((num_heads != 16) && + (num_heads != 128) && // main kernel natively supports #head=16 or #head=128 + (num_heads % 16 == 0) && (num_heads < 128)) { qk_batch_ratio = num_heads / 16; num_heads = 16; - num_batches *= qk_batch_ratio; + num_batches *= qk_batch_ratio; } - if (is_sparse) + if(is_sparse) { - num_batches *= uni_seqlen_qo; + num_batches *= uni_seqlen_qo; uni_seqlen_qo = 1; } - TORCH_CHECK((num_heads == 16) || (num_heads == 128), __func__, - ": only supports #heads in [16, 128], or (#head, uni_seqlen_qo) = (16*N, 1) where N is in [2, 8).") + TORCH_CHECK((num_heads == 16) || (num_heads == 128), + __func__, + ": only supports #heads in [16, 128], or (#head, uni_seqlen_qo) = (16*N, 1) where " + "N is in [2, 8).") - const int32_t lds_size_in_bytes = [&]() - { - const int32_t qo_tile_per_batch = - ck_tile::integer_divide_ceil(ck_tile::max(max_seqlen_qo, 1) * num_heads, kPackedQoLenPerWg); - const int32_t tot_qo_tiles = num_batches * qo_tile_per_batch; + const int32_t lds_size_in_bytes = [&]() { + const int32_t qo_tile_per_batch = ck_tile::integer_divide_ceil( + ck_tile::max(max_seqlen_qo, 1) * num_heads, kPackedQoLenPerWg); + const int32_t tot_qo_tiles = num_batches * qo_tile_per_batch; // this is maximun #clusters const int32_t num_clusters = dev_prop.multiProcessorCount; @@ -620,10 +712,12 @@ void get_mla_metadata_v1_1_device( // Memory for indptr about #cluster for each batch in direction of qo lds_size += (num_batches + 1) * sizeof(int32_t); // LDS for sorting - const int32_t power_2_num_batches = (num_batches <= 1) ? num_batches : ck_tile::next_power_of_two(num_batches); + const int32_t power_2_num_batches = + (num_batches <= 1) ? num_batches : ck_tile::next_power_of_two(num_batches); const int32_t lds_sort_size = lds_size + - ck_tile::integer_least_multiple(power_2_num_batches, ck_tile::get_warp_size()) * 2 * sizeof(int32_t); + ck_tile::integer_least_multiple(power_2_num_batches, ck_tile::get_warp_size()) * 2 * + sizeof(int32_t); // Memory for cost. 
Its size should be the same as #clusters lds_size += num_clusters * sizeof(int32_t); // Memory for counter of #works for each cluster. @@ -637,7 +731,8 @@ void get_mla_metadata_v1_1_device( }(); TORCH_CHECK(lds_size_in_bytes <= dev_prop.maxSharedMemoryPerMultiProcessor, - __func__, ": There is no enough LDS."); + __func__, + ": There is no enough LDS."); // auto opts = seqlens_kv_indptr.options(); // auto work_ptrs = torch::empty({2}, opts.dtype(torch::kUInt64)); @@ -649,30 +744,30 @@ void get_mla_metadata_v1_1_device( // kernel input parameters MlaMetadataV1KernelParameter params = {}; - params.p_work_metadata_ptrs = work_metadata_ptrs.data_ptr(); - params.p_work_indptr = work_indptr.data_ptr(); - params.p_work_info_set_raw = work_info_set.data_ptr(); - params.p_reduce_indptr = reduce_indptr.data_ptr(); - params.p_reduce_final_map = reduce_final_map.data_ptr(); - params.p_reduce_partial_map = reduce_partial_map.data_ptr(); - params.p_seqlens_qo_indptr = seqlens_qo_indptr.data_ptr(); - params.p_seqlens_kv_indptr = seqlens_kv_indptr.data_ptr(); - params.num_batches = num_batches; - params.num_heads = num_heads; - params.num_cu = num_cu; - params.reduce_indptr_size = reduce_indptr.size(0); - params.kv_granularity = kv_granularity; - params.kv_granularity_log2 = __builtin_ctz(kv_granularity); - params.uni_seqlen_qo = uni_seqlen_qo; - params.ori_seqlen_qo = ori_uni_seqlen_qo; - params.topk = topk; - params.is_causal = is_causal; - params.qk_batch_ratio = qk_batch_ratio; + params.p_work_metadata_ptrs = work_metadata_ptrs.data_ptr(); + params.p_work_indptr = work_indptr.data_ptr(); + params.p_work_info_set_raw = work_info_set.data_ptr(); + params.p_reduce_indptr = reduce_indptr.data_ptr(); + params.p_reduce_final_map = reduce_final_map.data_ptr(); + params.p_reduce_partial_map = reduce_partial_map.data_ptr(); + params.p_seqlens_qo_indptr = seqlens_qo_indptr.data_ptr(); + params.p_seqlens_kv_indptr = seqlens_kv_indptr.data_ptr(); + params.num_batches = num_batches; + 
params.num_heads = num_heads; + params.num_cu = num_cu; + params.reduce_indptr_size = reduce_indptr.size(0); + params.kv_granularity = kv_granularity; + params.kv_granularity_log2 = __builtin_ctz(kv_granularity); + params.uni_seqlen_qo = uni_seqlen_qo; + params.ori_seqlen_qo = ori_uni_seqlen_qo; + params.topk = topk; + params.is_causal = is_causal; + params.qk_batch_ratio = qk_batch_ratio; MlaMetadataV11Coefficients coefs = {}; - coefs.workload_limit_global_0 = 0.01f; - coefs.workload_limit_global_1 = 0.01f; - coefs.workload_limit_global_2 = 10.0f; + coefs.workload_limit_global_0 = 0.01f; + coefs.workload_limit_global_1 = 0.01f; + coefs.workload_limit_global_2 = 10.0f; // launch kernel MLA_METADATA_DISPATCHER( @@ -680,7 +775,10 @@ void get_mla_metadata_v1_1_device( kPackedQoLenPerWg, params.uni_seqlen_qo, topk, - dispatch_mla_metadata_v1_1_device( - params, coefs, stream, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor) - ); + dispatch_mla_metadata_v1_1_device( + params, coefs, stream, dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor)); } diff --git a/csrc/kernels/mla/metadata/v1_1_host.cuh b/csrc/kernels/mla/metadata/v1_1_host.cuh index 3c00b3848e..2a4e155ae4 100644 --- a/csrc/kernels/mla/metadata/v1_1_host.cuh +++ b/csrc/kernels/mla/metadata/v1_1_host.cuh @@ -1,18 +1,18 @@ #pragma once -#include #include "aiter_hip_common.h" #include "v1_comm.cuh" +#include template -std::vector get_mla_metadata_v1_1_host( - const torch::Tensor& seqlens_qo_indptr, // [batch size + 1] - const torch::Tensor& seqlens_kv_indptr, // [batch size + 1] - const int32_t num_heads_per_head_k, - const int32_t num_heads_k, - const bool is_causal, - const int32_t kv_granularity, - const bool no_redundant) +std::vector +get_mla_metadata_v1_1_host(const torch::Tensor& seqlens_qo_indptr, // [batch size + 1] + const torch::Tensor& seqlens_kv_indptr, // [batch size + 1] + const int32_t num_heads_per_head_k, + const int32_t num_heads_k, + const bool is_causal, + const 
int32_t kv_granularity, + const bool no_redundant) { using index_t = uint32_t; @@ -22,7 +22,7 @@ std::vector get_mla_metadata_v1_1_host( HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); const int32_t num_batches = seqlens_qo_indptr.size(0) - 1; - const int32_t num_heads = num_heads_k * num_heads_per_head_k; + const int32_t num_heads = num_heads_k * num_heads_per_head_k; auto seqlens_qo_indptr_cpu = seqlens_qo_indptr.to(at::DeviceType::CPU); auto seqlens_kv_indptr_cpu = seqlens_kv_indptr.to(at::DeviceType::CPU); @@ -34,7 +34,7 @@ std::vector get_mla_metadata_v1_1_host( std::vector batch_infos; batch_infos.reserve(num_batches); int32_t sum_packed_qo_len = 0; - for (int32_t bid = 0; bid < num_batches; ++bid) + for(int32_t bid = 0; bid < num_batches; ++bid) { const int32_t qo_len = p_seqlens_qo_indptr[bid + 1] - p_seqlens_qo_indptr[bid]; const int32_t kv_len = p_seqlens_kv_indptr[bid + 1] - p_seqlens_kv_indptr[bid]; @@ -47,67 +47,79 @@ std::vector get_mla_metadata_v1_1_host( } std::sort(batch_infos.begin(), batch_infos.end(), std::greater()); - // Step.1. Calculate the size of cluster and some related information. The size is the number of workgroups + // Step.1. Calculate the size of cluster and some related information. The size is the number of + // workgroups // composing each cluster. The size is determined by average packed qo length. 
- const int32_t cluster_size = - [&]() { + const int32_t cluster_size = [&]() { const int32_t avg_packed_qo_len = sum_packed_qo_len / num_batches; const int32_t cluster_size = ck_tile::integer_divide_ceil(avg_packed_qo_len, Traits::kPackedQoLenPerWg); return ck_tile::min(cluster_size, Traits::kMaxClusterSize); }(); - TORCH_CHECK((dev_prop.multiProcessorCount % cluster_size) == 0, __func__, ": Invalid cluster_size!"); + TORCH_CHECK( + (dev_prop.multiProcessorCount % cluster_size) == 0, __func__, ": Invalid cluster_size!"); const int32_t num_clusters = dev_prop.multiProcessorCount / cluster_size; const int32_t cluster_len_q = cluster_size * Traits::kPackedQoLenPerWg; // Step.2. - // a. Get total valid (after causal masking) kv lengths and the maximun workload handled by each cluster - // b. Get a indptr array about #cluster for each batch in direction of qo. + // a. Get total valid (after causal masking) kv lengths and the maximun workload handled by + // each cluster b. Get a indptr array about #cluster for each batch in direction of qo. 
int32_t workload_sum = 0; std::vector num_qo_clusters_indptr; num_qo_clusters_indptr.reserve(num_batches + 1); num_qo_clusters_indptr.push_back(0); - for (const auto& binfo : batch_infos) + for(const auto& binfo : batch_infos) { - const int32_t packed_qo_len = binfo.qo_len * num_heads; - const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); + const int32_t packed_qo_len = binfo.qo_len * num_heads; + const int32_t num_qo_tiles = ck_tile::integer_divide_ceil(packed_qo_len, cluster_len_q); const int32_t packed_qo_tile_len = ck_tile::min(packed_qo_len, cluster_len_q); num_qo_clusters_indptr.push_back(num_qo_clusters_indptr.back() + num_qo_tiles); - for (int32_t tid = 0; tid < num_qo_tiles; ++tid) + for(int32_t tid = 0; tid < num_qo_tiles; ++tid) { - const int32_t kv_len_valid = - cal_packed_causal_kv_len( - binfo.qo_len, binfo.kv_len, tid, packed_qo_tile_len, num_qo_tiles, num_heads, is_causal); + const int32_t kv_len_valid = cal_packed_causal_kv_len(binfo.qo_len, + binfo.kv_len, + tid, + packed_qo_tile_len, + num_qo_tiles, + num_heads, + is_causal); // always assume that each batch of tile will be splited once along kv. - const int32_t kv_len_splited = - ck_tile::integer_least_multiple(ck_tile::integer_divide_ceil(kv_len_valid, 2), kv_granularity); + const int32_t kv_len_splited = ck_tile::integer_least_multiple( + ck_tile::integer_divide_ceil(kv_len_valid, 2), kv_granularity); workload_sum += 2 * cal_cost(packed_qo_tile_len, kv_len_splited) + kv_granularity; } } - const int32_t workload_limit_global = cal_workload_limit_global_v0(workload_sum, num_clusters, kv_granularity); + const int32_t workload_limit_global = + cal_workload_limit_global_v0(workload_sum, num_clusters, kv_granularity); #if PRINT_DBG printf("[metadata] workload_limit_global=%d\n", workload_limit_global); #endif // Step.3.1. 
Allocates output buffers except indptrs std::vector> work_info_set(num_clusters, std::vector()); - std::vector> reduce_partial_map(num_qo_clusters_indptr.back(), std::vector()); + std::vector> reduce_partial_map(num_qo_clusters_indptr.back(), + std::vector()); std::vector reduce_partial_info(num_qo_clusters_indptr.back(), {{-1, -2}}); // Step.3.2. Declare priority queue using ClusterCost = std::tuple; // cluster_id(cid), cost - auto pq_cmp = [](const ClusterCost& l, const ClusterCost& r) { return std::get<1>(l) > std::get<1>(r); }; + auto pq_cmp = [](const ClusterCost& l, const ClusterCost& r) { + return std::get<1>(l) > std::get<1>(r); + }; std::priority_queue, decltype(pq_cmp)> cost_heap(pq_cmp); - for (int32_t cid = 0; cid < num_clusters; ++cid) { cost_heap.push(std::tuple{cid, 0}); } + for(int32_t cid = 0; cid < num_clusters; ++cid) + { + cost_heap.push(std::tuple{cid, 0}); + } // Step.4. Fill the output buffers except indptrs int32_t num_reduce_row = 0; int32_t num_partial_outputs = 0; int32_t loc_partial_outputs = 0; - for (const auto& binfo : batch_infos) + for(const auto& binfo : batch_infos) { const int32_t bid = binfo.batch_idx; const int32_t qo_len = binfo.qo_len; @@ -121,42 +133,55 @@ std::vector get_mla_metadata_v1_1_host( printf("[metadata] Dividing batch=%d, qo_len=%d, kv_len=%d\n", bid, qo_len, kv_len); #endif - for (int32_t tid = 0; tid < num_qo_tiles; ++tid) + for(int32_t tid = 0; tid < num_qo_tiles; ++tid) { const int32_t global_cluster_q_idx = num_qo_clusters_indptr[bid] + tid; - int32_t remaining_kv_len = - cal_packed_causal_kv_len(qo_len, kv_len, tid, cluster_len_q, num_qo_tiles, num_heads, is_causal); + int32_t remaining_kv_len = cal_packed_causal_kv_len( + qo_len, kv_len, tid, cluster_len_q, num_qo_tiles, num_heads, is_causal); int32_t kv_start_local = 0; const auto [cid_top, accum_cost_top] = cost_heap.top(); - const int32_t remaining_capability_top = cal_kv_len(workload_limit_global - accum_cost_top, cluster_len_q); + const int32_t 
remaining_capability_top = + cal_kv_len(workload_limit_global - accum_cost_top, cluster_len_q); const int32_t num_splits_estimated = ck_tile::integer_divide_ceil(remaining_kv_len, remaining_capability_top); - // For the case of #splits==2, make sure that the tailing tile is smaller than Traits::kSplitTolerance. - const bool split_kv = (num_splits_estimated == 2) ? - ((remaining_kv_len - remaining_capability_top) > Traits::kSplitTolerance) : (num_splits_estimated > 1); - const int32_t kv_len_limit_floor = - ck_tile::integer_least_multiple(ck_tile::integer_divide_ceil(kv_len, num_clusters), kv_granularity); + // For the case of #splits==2, make sure that the tailing tile is smaller than + // Traits::kSplitTolerance. + const bool split_kv = + (num_splits_estimated == 2) + ? ((remaining_kv_len - remaining_capability_top) > Traits::kSplitTolerance) + : (num_splits_estimated > 1); + const int32_t kv_len_limit_floor = ck_tile::integer_least_multiple( + ck_tile::integer_divide_ceil(kv_len, num_clusters), kv_granularity); do { // Check and update cost_heap auto [cid, accum_cost] = cost_heap.top(); cost_heap.pop(); - const int32_t remaining_capability = cal_kv_len(workload_limit_global - accum_cost, cluster_len_q); - const int32_t kv_len_limit_local = - [&]() { - const int32_t limit_ori = ck_tile::max(remaining_capability, kv_len_limit_floor); - const int32_t tail_size = (remaining_kv_len > limit_ori) ? (remaining_kv_len - limit_ori) : 0x7fffffff; - const int32_t limit_fin = (tail_size <= Traits::kSplitTolerance) ? remaining_kv_len : limit_ori; + const int32_t remaining_capability = + cal_kv_len(workload_limit_global - accum_cost, cluster_len_q); + const int32_t kv_len_limit_local = [&]() { + const int32_t limit_ori = + ck_tile::max(remaining_capability, kv_len_limit_floor); + const int32_t tail_size = (remaining_kv_len > limit_ori) + ? (remaining_kv_len - limit_ori) + : 0x7fffffff; + const int32_t limit_fin = + (tail_size <= Traits::kSplitTolerance) ? 
remaining_kv_len : limit_ori; return limit_fin; }(); const int32_t kv_len_consuming = ck_tile::min(remaining_kv_len, kv_len_limit_local); - const int32_t cost = cal_cost(cluster_len_q, kv_len_consuming); + const int32_t cost = cal_cost(cluster_len_q, kv_len_consuming); #if PRINT_DBG - printf("[metadata] cost heap updated: cid=%d, pre_cost=%d, new_cost=%d, tot_cost=%d, kv_len_cons=%d\n", - cid, accum_cost, cost, accum_cost+cost, kv_len_consuming); + printf("[metadata] cost heap updated: cid=%d, pre_cost=%d, new_cost=%d, " + "tot_cost=%d, kv_len_cons=%d\n", + cid, + accum_cost, + cost, + accum_cost + cost, + kv_len_consuming); #endif const int32_t new_cost = accum_cost + cost; cost_heap.push(std::tuple{cid, new_cost}); @@ -165,17 +190,19 @@ std::vector get_mla_metadata_v1_1_host( MlaWorkInfo work_info{}; work_info.batch_idx = bid; work_info.qo_start = tid * cluster_len_q + qo_batch_start; - work_info.qo_end = ck_tile::min(work_info.qo_start + cluster_len_q, qo_batch_start + qo_len); + work_info.qo_end = + ck_tile::min(work_info.qo_start + cluster_len_q, qo_batch_start + qo_len); work_info.kv_start = kv_start_local + kv_batch_start; work_info.kv_end = work_info.kv_start + kv_len_consuming; work_info.kv_offset = kv_batch_end - work_info.kv_end; - if (split_kv) + if(split_kv) { work_info.partial_qo_loc = loc_partial_outputs; - if (reduce_partial_map[global_cluster_q_idx].empty()) + if(reduce_partial_map[global_cluster_q_idx].empty()) { ++num_reduce_row; - reduce_partial_info[global_cluster_q_idx] = {{ work_info.qo_start, work_info.qo_end }}; + reduce_partial_info[global_cluster_q_idx] = { + {work_info.qo_start, work_info.qo_end}}; } reduce_partial_map[global_cluster_q_idx].push_back(loc_partial_outputs); ++num_partial_outputs; @@ -190,14 +217,13 @@ std::vector get_mla_metadata_v1_1_host( // Update state remaining_kv_len -= kv_len_consuming; kv_start_local += kv_len_consuming; - } - while (remaining_kv_len > 0); + } while(remaining_kv_len > 0); } } #if PRINT_DBG 
printf("[metadata] Final Cost Heap Status: %zu elements\n", cost_heap.size()); - while (cost_heap.empty() == false) + while(cost_heap.empty() == false) { auto [id, cost] = cost_heap.top(); cost_heap.pop(); @@ -209,50 +235,65 @@ std::vector get_mla_metadata_v1_1_host( std::vector work_indptr; work_indptr.reserve(num_clusters + 1); work_indptr.push_back(0); - for (int32_t cid = 0; cid < num_clusters; ++cid) + for(int32_t cid = 0; cid < num_clusters; ++cid) { - if ((work_info_set[cid].empty() == false) || (no_redundant == false)) + if((work_info_set[cid].empty() == false) || (no_redundant == false)) { work_indptr.push_back(work_indptr.back() + work_info_set[cid].size()); } } const int32_t num_works = work_indptr.back(); - const int32_t reduce_final_map_size = no_redundant ? num_reduce_row : num_qo_clusters_indptr.back(); + const int32_t reduce_final_map_size = + no_redundant ? num_reduce_row : num_qo_clusters_indptr.back(); const int32_t reduce_indptr_size = reduce_final_map_size + 1; std::vector reduce_final_map; std::vector reduce_indptr; reduce_final_map.reserve(reduce_final_map_size); reduce_indptr.reserve(reduce_indptr_size); reduce_indptr.push_back(0); - for (auto [global_cluster_q_idx ,rid] = std::tuple{0, 0}; - (global_cluster_q_idx < num_qo_clusters_indptr.back()) && ((rid < num_reduce_row) || (no_redundant == false)); - ++global_cluster_q_idx) + for(auto [global_cluster_q_idx, rid] = std::tuple{0, 0}; + (global_cluster_q_idx < num_qo_clusters_indptr.back()) && + ((rid < num_reduce_row) || (no_redundant == false)); + ++global_cluster_q_idx) { - if ((reduce_partial_map[global_cluster_q_idx].empty() == false) || (no_redundant == false)) + if((reduce_partial_map[global_cluster_q_idx].empty() == false) || (no_redundant == false)) { - reduce_indptr.push_back(reduce_indptr.back() + reduce_partial_map[global_cluster_q_idx].size()); + reduce_indptr.push_back(reduce_indptr.back() + + reduce_partial_map[global_cluster_q_idx].size()); 
reduce_final_map.push_back(reduce_partial_info[global_cluster_q_idx]); ++rid; } } // Step.6. Flatten 2D arries - auto work_info_set_flatten = flatten(work_info_set, num_works); + auto work_info_set_flatten = flatten(work_info_set, num_works); auto reduce_partial_map_flatten = flatten(reduce_partial_map, num_partial_outputs); // Step.7. Create tensors. - auto input_opts = seqlens_qo_indptr.options(); - auto int_opts = torch::TensorOptions().dtype(torch::kInt32); + auto input_opts = seqlens_qo_indptr.options(); + auto int_opts = torch::TensorOptions().dtype(torch::kInt32); auto work_metadata_ptrs_tsr = torch::empty({2}, torch::TensorOptions().dtype(torch::kUInt64)); - auto work_info_set_tsr = torch::from_blob(work_info_set_flatten.data(), {num_works, kSizeMlaWorkInfoInDw}, int_opts).to(input_opts); - auto work_indptr_tsr = torch::from_blob(work_indptr.data(), {static_cast(work_indptr.size())}, int_opts).to(input_opts); - auto reduce_indptr_tsr = torch::from_blob(reduce_indptr.data(), {reduce_indptr_size}, int_opts).to(input_opts); - auto reduce_final_map_tsr = torch::from_blob(reduce_final_map.data(), {reduce_final_map_size, kSizeMlaPartialTileInfoInDw}, int_opts).to(input_opts); - auto reduce_partial_map_tsr = torch::from_blob(reduce_partial_map_flatten.data(), {num_partial_outputs}, int_opts).to(input_opts); + auto work_info_set_tsr = + torch::from_blob(work_info_set_flatten.data(), {num_works, kSizeMlaWorkInfoInDw}, int_opts) + .to(input_opts); + auto work_indptr_tsr = + torch::from_blob(work_indptr.data(), {static_cast(work_indptr.size())}, int_opts) + .to(input_opts); + auto reduce_indptr_tsr = + torch::from_blob(reduce_indptr.data(), {reduce_indptr_size}, int_opts).to(input_opts); + auto reduce_final_map_tsr = + torch::from_blob( + reduce_final_map.data(), {reduce_final_map_size, kSizeMlaPartialTileInfoInDw}, int_opts) + .to(input_opts); + auto reduce_partial_map_tsr = + torch::from_blob(reduce_partial_map_flatten.data(), {num_partial_outputs}, int_opts) + 
.to(input_opts); - work_metadata_ptrs_tsr.index_put_({0}, static_cast(reinterpret_cast(work_indptr_tsr.data_ptr()))); - work_metadata_ptrs_tsr.index_put_({1}, static_cast(reinterpret_cast(work_info_set_tsr.data_ptr()))); + work_metadata_ptrs_tsr.index_put_( + {0}, static_cast(reinterpret_cast(work_indptr_tsr.data_ptr()))); + work_metadata_ptrs_tsr.index_put_( + {1}, static_cast(reinterpret_cast(work_info_set_tsr.data_ptr()))); // Last step. Copy to the device of input and return the results. return {work_metadata_ptrs_tsr.to(input_opts), diff --git a/hsa/gfx942/fmha_v3_fwd/codegen.py b/hsa/gfx942/fmha_v3_fwd/codegen.py index 710f6e716e..37a6da8bad 100644 --- a/hsa/gfx942/fmha_v3_fwd/codegen.py +++ b/hsa/gfx942/fmha_v3_fwd/codegen.py @@ -862,7 +862,7 @@ class fmha_fwd_v3_kernel } } } - } + } } } } diff --git a/op_tests/cpp/mha/benchmark_mha_bwd.cpp b/op_tests/cpp/mha/benchmark_mha_bwd.cpp index b8829a1372..2d4b842404 100644 --- a/op_tests/cpp/mha/benchmark_mha_bwd.cpp +++ b/op_tests/cpp/mha/benchmark_mha_bwd.cpp @@ -156,7 +156,9 @@ auto create_args(int argc, char* argv[]) .insert("v3_bf16_cvt", "1", "float to bf16 convert type when bwd_v3 is set to 1, 0:RTNE; 1:RTNA; 2:RTZ") - .insert("v3_api_check", "0", "if set to 1, check whether the input scenario is supported by the asm kernel."); + .insert("v3_api_check", + "0", + "if set to 1, check whether the input scenario is supported by the asm kernel."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser); @@ -263,7 +265,7 @@ bool run(const ck_tile::ArgParser& arg_parser) bool bwd_v3 = arg_parser.get_bool("bwd_v3"); bool v3_atomic_fp32 = arg_parser.get_bool("v3_atomic_fp32"); int v3_bf16_cvt = arg_parser.get_int("v3_bf16_cvt"); - bool v3_api_check = arg_parser.get_bool("v3_api_check"); + bool v3_api_check = arg_parser.get_bool("v3_api_check"); ck_tile::stream_config stream_config{nullptr, true, @@ -353,9 +355,11 @@ bool run(const ck_tile::ArgParser& arg_parser) const ck_tile::index_t 
nsplits = deterministic ? ck_tile::integer_divide_ceil(max_seqlen_k, kN0) : 1; const ck_tile::index_t a16_dq_acc_seq = - v3_atomic_fp32 ? shape_seqlen_q : (mode == mode_enum::batch ? (seqlen_q + 15) / 16 * 16 : (max_seqlen_q + 15) / 16 * 16); + v3_atomic_fp32 ? shape_seqlen_q + : (mode == mode_enum::batch ? (seqlen_q + 15) / 16 * 16 + : (max_seqlen_q + 15) / 16 * 16); // hdim_q = 192 pipline currently don't support hdim padding - const ck_tile::index_t a16_dq_acc_hdim = v3_atomic_fp32 ? hdim_q : hdim_q == 192? 192: 128; + const ck_tile::index_t a16_dq_acc_hdim = v3_atomic_fp32 ? hdim_q : hdim_q == 192 ? 192 : 128; ck_tile::HostTensor q_host( get_lengths(i_perm, shape_batch, nhead, shape_seqlen_q, hdim_q)); @@ -490,7 +494,8 @@ bool run(const ck_tile::ArgParser& arg_parser) << ", h:" << nhead << "/" << nhead_k << ", s:" << seqlen_q << "/" << seqlen_k << ", d:" << hdim_q << "/" << hdim_v << ", scale:" << scale << ", bias:" << bias << ", dbias:" << use_dbias << ", p_drop:" << p_drop << ", s_randval:" << s_randval - << ", deterministic:" << deterministic << ", mask:" << mask << std::flush << std::endl; + << ", deterministic:" << deterministic << ", mask:" << mask << std::flush + << std::endl; std::size_t workspace_size = dq_acc_host.get_element_space_size_in_bytes() * sizeof(AccDataType) / (1024 * 1024); @@ -502,16 +507,25 @@ bool run(const ck_tile::ArgParser& arg_parser) } auto get_mask_type = [&]() { - if (mask.type == mask_enum::no_mask) { + if(mask.type == mask_enum::no_mask) + { return 0; - } else { - if (mask.type == mask_enum::window_generic) { + } + else + { + if(mask.type == mask_enum::window_generic) + { assert(false); return 0; - } else { - if ((mask.left == -1) && (mask.right == 0)) { + } + else + { + if((mask.left == -1) && (mask.right == 0)) + { return (mask.type == mask_enum::mask_top_left) ? 
1 : 2; - } else { + } + else + { return 3; } } @@ -580,7 +594,7 @@ bool run(const ck_tile::ArgParser& arg_parser) v3_atomic_fp32, v3_bf16_cvt, v3_api_check, - + hdim_q, hdim_v, data_type, @@ -591,7 +605,7 @@ bool run(const ck_tile::ArgParser& arg_parser) p_drop > 0, s_randval, deterministic, - + q_buf.GetDeviceBuffer(), k_buf.GetDeviceBuffer(), v_buf.GetDeviceBuffer(), diff --git a/op_tests/test_gemm_a16w16.py b/op_tests/test_gemm_a16w16.py index c5e297b0ff..2fba35c717 100755 --- a/op_tests/test_gemm_a16w16.py +++ b/op_tests/test_gemm_a16w16.py @@ -2,9 +2,7 @@ # Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. import argparse -import os import random -import sys from functools import lru_cache import pandas as pd diff --git a/op_tests/test_moe_2stage.py b/op_tests/test_moe_2stage.py index 4da2af1e12..0400cd6abd 100644 --- a/op_tests/test_moe_2stage.py +++ b/op_tests/test_moe_2stage.py @@ -11,8 +11,6 @@ from aiter.jit.utils.chip_info import get_gfx import argparse import pandas as pd -import os -import numpy as np import logging from aiter.fused_moe import ( @@ -28,7 +26,6 @@ shuffle_scale_a16w4, shuffle_weight_a16w4, ) -from aiter import ActivationType torch.int4 = getattr(torch, "int4", torch.uint32) torch.set_default_device("cuda") @@ -132,10 +129,10 @@ def weight_per_128x128_quant(weight, quant_dtype): a1_scale = a1_scale.squeeze(-1) elif ( qType == aiter.QuantType.per_1x32 - and (AQDType in [dtypes.bf16, dtypes.fp16]) + and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) and WQDType == dtypes.fp4x2 - ): # a16w4 - a1_qt = input.to(AQDType) + ): # a16w4 & a8w4 + a1_qt = input.to(dtypes.bf16) a1_scale = None else: a1_qt, a1_scale = torch_quant(input, quant_dtype=AQDType) @@ -143,7 +140,7 @@ def weight_per_128x128_quant(weight, quant_dtype): # bias dtype convert if ( qType == aiter.QuantType.per_1x32 - and (AQDType in [dtypes.bf16, dtypes.fp16]) + and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) and (WQDType == dtypes.fp4x2) ): 
# a16w4 exp_bias1_aiter = exp_bias1.to(dtypes.fp32) @@ -170,7 +167,7 @@ def weight_per_128x128_quant(weight, quant_dtype): w2_scale_aiter = fp4_utils.e8m0_shuffle(w2_scale) elif ( qType == aiter.QuantType.per_1x32 - and (AQDType in [dtypes.bf16, dtypes.fp16]) + and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) and (WQDType == dtypes.fp4x2) ): # a16w4 w1_qt_aiter = shuffle_weight_a16w4(w1_qt_aiter, 16, True) @@ -210,9 +207,9 @@ def weight_per_128x128_quant(weight, quant_dtype): a2_scale = a2_scale.view(token, topk, -1) elif ( qType == aiter.QuantType.per_1x32 - and (AQDType in [dtypes.bf16, dtypes.fp16]) + and (AQDType in [dtypes.bf16, dtypes.fp16, dtypes.fp8]) and (WQDType == dtypes.fp4x2) - ): # a16w4 + ): # a16w4 & a8w4 a2_qt = out1_ref a2_scale = None else: @@ -299,10 +296,12 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): (aiter.QuantType.per_1x32, dtypes.fp4x2, dtypes.fp4x2), # a4w4 (aiter.QuantType.per_128x128, dtypes.fp8, dtypes.fp8), # a8w8 (aiter.QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2), # a16w4 + (aiter.QuantType.per_1x32, dtypes.fp8, dtypes.fp4x2), # a8w4 ] l_act = [aiter.ActivationType.Silu, aiter.ActivationType.Gelu][:1] l_doweight_stage1 = [False, True][:1] -l_hidden_intermediate_pad = [(0, 0), (65, 65), (129, 191)][1:2] +# l_hidden_intermediate_pad = [(0, 0), (65, 65), (129, 191)][1:2] +l_hidden_intermediate_pad = [(0, 0), (192, 128), (129, 191)][1:2] l_preshuffle = [False, True] @@ -354,7 +353,9 @@ def calc_diff(x: torch.Tensor, y: torch.Tensor): 2: aiter.QuantType.per_Token, dtypes.fp8, dtypes.fp8 # a8w8 3: aiter.QuantType.per_Token, dtypes.fp8, torch.int4 # a8w4 4: aiter.QuantType.per_1x32, dtypes.fp4x2, dtypes.fp4x2 # a4w4 - 5: aiter.QuantType.per_128x128, dtypes.fp8, dtypes.fp8, # a8w8""", + 5: aiter.QuantType.per_128x128, dtypes.fp8, dtypes.fp8, # a8w8, + 6: aiter.QuantType.per_1x32, dtypes.bf16, dtypes.fp4x2, # a16w4, + 7: aiter.QuantType.per_1x32, dtypes.fp8, dtypes.fp4x2, # a8w4,""", ) parser.add_argument( @@ -463,6 +464,30 
@@ def calc_diff(x: torch.Tensor, y: torch.Tensor): intermediate_pad=intermediate_pad, ) df.append(ret) + elif (quant_type, aq_dtype, wq_dtype) == ( + aiter.QuantType.per_1x32, + dtypes.fp8, + dtypes.fp4x2, + ): + for hidden_pad, intermediate_pad in l_hidden_intermediate_pad: + for m in l_tokenNum: + ret = test_fmoe( + dtype, + m, + model_dim, + inter_dim, + args.expert, + args.topk, + aiter.ActivationType.Swiglu, + quant_type, + aq_dtype, + wq_dtype, + use_g1u1=True, + doweight_stage1=doweight_stage1, + hidden_pad=hidden_pad, + intermediate_pad=intermediate_pad, + ) + df.append(ret) elif (quant_type, aq_dtype, wq_dtype) == ( aiter.QuantType.per_1x32, dtypes.fp4x2, From c6965e61a3c44e443c75d3c1e3632013a521a3c7 Mon Sep 17 00:00:00 2001 From: amd-ruitang3 <145657428+amd-ruitang3@users.noreply.github.com> Date: Sat, 20 Dec 2025 22:08:39 +0800 Subject: [PATCH 24/40] bf16_gemm_clean_in_kl (#1700) * bf16_gemm_clean_in_kl * update * update * update * update --- aiter/ops/gemm_op_a16w16.py | 10 +- aiter/tuned_gemm.py | 14 +- csrc/include/asm_gemm_a16w16.h | 1 + csrc/include/rocm_ops.hpp | 1 + csrc/py_itfs_cu/asm_gemm_a16w16.cu | 293 +++++++----------- hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv | 34 +- ...p32bf16_tn_128x64_bshuffle_splitk_clean.co | Bin 0 -> 28656 bytes ...p32bf16_tn_160x64_bshuffle_splitk_clean.co | Bin 0 -> 32720 bytes ...fp32bf16_tn_32x64_bshuffle_splitk_clean.co | Bin 0 -> 16432 bytes ...bf16gemm_fp32bf16_tn_32x64_splitk_clean.co | Bin 0 -> 18512 bytes ...fp32bf16_tn_48x64_bshuffle_splitk_clean.co | Bin 0 -> 18464 bytes ...bf16gemm_fp32bf16_tn_48x64_splitk_clean.co | Bin 0 -> 20544 bytes ...fp32bf16_tn_64x64_bshuffle_splitk_clean.co | Bin 0 -> 20496 bytes ...bf16gemm_fp32bf16_tn_64x64_splitk_clean.co | Bin 0 -> 22576 bytes ...fp32bf16_tn_80x64_bshuffle_splitk_clean.co | Bin 0 -> 22560 bytes ...bf16gemm_fp32bf16_tn_80x64_splitk_clean.co | Bin 0 -> 24640 bytes ...fp32bf16_tn_96x64_bshuffle_splitk_clean.co | Bin 0 -> 24592 bytes 
...bf16gemm_fp32bf16_tn_96x64_splitk_clean.co | Bin 0 -> 26672 bytes hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv | 38 ++- ...p32bf16_tn_128x64_bshuffle_splitk_clean.co | Bin 0 -> 28656 bytes ...p32bf16_tn_160x64_bshuffle_splitk_clean.co | Bin 0 -> 32720 bytes ...fp32bf16_tn_32x64_bshuffle_splitk_clean.co | Bin 0 -> 16432 bytes ...bf16gemm_fp32bf16_tn_32x64_splitk_clean.co | Bin 0 -> 18512 bytes ...fp32bf16_tn_48x64_bshuffle_splitk_clean.co | Bin 0 -> 18464 bytes ...bf16gemm_fp32bf16_tn_48x64_splitk_clean.co | Bin 0 -> 20544 bytes ...fp32bf16_tn_64x64_bshuffle_splitk_clean.co | Bin 0 -> 20496 bytes ...bf16gemm_fp32bf16_tn_64x64_splitk_clean.co | Bin 0 -> 22576 bytes ...fp32bf16_tn_80x64_bshuffle_splitk_clean.co | Bin 0 -> 22560 bytes ...bf16gemm_fp32bf16_tn_80x64_splitk_clean.co | Bin 0 -> 24640 bytes ...fp32bf16_tn_96x64_bshuffle_splitk_clean.co | Bin 0 -> 24592 bytes ...bf16gemm_fp32bf16_tn_96x64_splitk_clean.co | Bin 0 -> 26672 bytes op_tests/test_gemm_a16w16.py | 3 +- 32 files changed, 184 insertions(+), 210 deletions(-) create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co create mode 100755 
hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co create mode 100755 hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co diff --git a/aiter/ops/gemm_op_a16w16.py b/aiter/ops/gemm_op_a16w16.py index e9f86a5cf9..d83ffd0309 100644 --- a/aiter/ops/gemm_op_a16w16.py +++ b/aiter/ops/gemm_op_a16w16.py @@ -20,6 +20,7 @@ def gen_gemm_a16w16_asm_fake_tensors( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, @@ -37,6 +38,7 @@ def gemm_a16w16_asm( A: Tensor, B: Tensor, out: Tensor, + semaphore: Tensor, bias: Optional[Tensor] = None, splitK: Optional[int] = None, kernelName: Optional[str] = None, @@ -44,6 +46,11 @@ def gemm_a16w16_asm( ) -> Tensor: ... 
+@functools.lru_cache(maxsize=1) +def get_semaphore_workspace(device: torch.device) -> Tensor: + return torch.zeros((16, 64), dtype=torch.uint32, device=device) + + def gemm_a16w16( A: Tensor, B: Tensor, @@ -52,4 +59,5 @@ def gemm_a16w16( splitK: Optional[int] = None, kernelName: Optional[str] = None, ): - return gemm_a16w16_asm(A, B, out, bias, splitK, kernelName) + sema = get_semaphore_workspace(out.device) + return gemm_a16w16_asm(A, B, out, sema, bias, splitK, kernelName) diff --git a/aiter/tuned_gemm.py b/aiter/tuned_gemm.py index c7c8b5994f..4465facd34 100644 --- a/aiter/tuned_gemm.py +++ b/aiter/tuned_gemm.py @@ -24,7 +24,14 @@ import torch.nn.functional as F from torch import Tensor -from aiter import dtypes, gemm_a16w16_asm, hipb_create_extension, hipb_mm, logger +from aiter import ( + dtypes, + gemm_a16w16_asm, + get_semaphore_workspace, + hipb_create_extension, + hipb_mm, + logger, +) from aiter.jit.core import AITER_CONFIGS, AITER_LOG_TUNED_CONFIG from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.jit.utils.torch_guard import torch_compile_guard @@ -392,7 +399,10 @@ def asm_gemm( out_asm = torch.empty( inp.shape[0], weights.shape[0], dtype=otype, device=inp.device ) - return gemm_a16w16_asm(inp, weights, out_asm, bias, splitK, KernelName, bpreshuffle) + sema = get_semaphore_workspace(out_asm.device) + return gemm_a16w16_asm( + inp, weights, out_asm, sema, bias, splitK, KernelName, bpreshuffle + ) def triton_gemm( diff --git a/csrc/include/asm_gemm_a16w16.h b/csrc/include/asm_gemm_a16w16.h index c7788bb3ec..26a207882c 100644 --- a/csrc/include/asm_gemm_a16w16.h +++ b/csrc/include/asm_gemm_a16w16.h @@ -6,6 +6,7 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 torch::Tensor& B, // B:[N, K] bf16 torch::Tensor& out, // Out:[M, N] f32 + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 4a973ac1b5..908865ae07
100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -456,6 +456,7 @@ namespace py = pybind11; py::arg("A"), \ py::arg("B"), \ py::arg("out"), \ + py::arg("semaphore"), \ py::arg("bias") = std::nullopt, \ py::arg("splitK") = std::nullopt, \ py::arg("kernelName") = std::nullopt, \ diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index b627f06af9..4d6b723e4a 100644 --- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -9,16 +9,15 @@ #include #include -// start to prepare the input and output buffer struct __attribute__((packed)) KernelArgs { - void *ptr_D; + void* ptr_D; p2 _p0; - void *ptr_C; + void* ptr_C; p2 _p1; - void *ptr_A; + void* ptr_A; p2 _p2; - void *ptr_B; + void* ptr_B; p2 _p3; float alpha; p3 _p4; @@ -50,10 +49,12 @@ struct __attribute__((packed)) KernelArgs p3 _p17; unsigned int is_out_b16; p3 _p18; - void *ptr_Bias; + void* ptr_Bias; p2 _p19; unsigned int add_bias; p3 _p20; + void* ptr_semaphore; + p2 _p21; }; std::tuple @@ -64,6 +65,7 @@ get_heuristic_kernel(int M, std::string arch_id, bool bpreshuffle, int add_bias, + int clean = 1, std::optional splitk = std::nullopt, std::optional kernelName = std::nullopt) { @@ -72,7 +74,7 @@ get_heuristic_kernel(int M, HIP_CALL(hipGetDevice(&dev)); HIP_CALL(hipGetDeviceProperties(&dev_prop, dev)); uint32_t num_cu = dev_prop.multiProcessorCount; - // printf("num_cu: %d\n", num_cu); + uint32_t empty_cu = num_cu; uint32_t pure_tg_num = 0; uint32_t round = 0xffffffff; @@ -84,41 +86,44 @@ get_heuristic_kernel(int M, for(const auto& el : *cfgs) { - if (el.first.find(arch_id) != 0) + if(el.first.find(arch_id) != 0) continue; const auto& cfg = el.second; if(kernelName.has_value() && el.first != (arch_id + kernelName.value())) continue; if(kernelName.has_value()) { - TORCH_CHECK( - N % cfg.tileN == 0 && - cfg.bPreshuffle == (bpreshuffle ? 
1 : 0) && - (add_bias == 0 || cfg.bias == 1), - __func__, - " the specified kernel name ", el.first, - " cannot support the input shape (N=", N, ", tileN=", cfg.tileN, - ") or bias/preshuffle setting (preshuffle=", bpreshuffle, - ", bias=", add_bias, ")." - ); + TORCH_CHECK(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && + (add_bias == 0 || cfg.bias == 1), + __func__, + " the specified kernel name ", + el.first, + " cannot support the input shape (N=", + N, + ", tileN=", + cfg.tileN, + ") or bias/preshuffle setting (preshuffle=", + bpreshuffle, + ", bias=", + add_bias, + ")."); } - if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && (add_bias == 0 || cfg.bias == 1)) + if(N % cfg.tileN == 0 && cfg.bPreshuffle == (bpreshuffle ? 1 : 0) && + (add_bias == 0 || cfg.bias == 1) && clean == cfg.clean) { - // 1. select splitK int split_K = 1; if(splitk.has_value()) - split_K = splitk.value(); - else if (cfg.splitK == 1)// auto select + split_K = std::min(splitk.value(), 16); + else if(cfg.splitK == 1) { - pure_tg_num = - ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); // M-orient support OOB + pure_tg_num = ((M + cfg.tileM - 1) / cfg.tileM) * (N / cfg.tileN); if(pure_tg_num < num_cu) { - TORCH_CHECK(cfg.subK > 0, __func__, " cfg.subK must be greater than 0 to avoid division by zero."); - int max_split = std::min( - std::min(static_cast(num_cu / pure_tg_num), 16), - static_cast(K / cfg.subK) // “K-dim must satisfy min 128 bytes. BF16 are 2 bytes each, this means min ele of K is 64.” - ); + TORCH_CHECK(cfg.subK > 0, + __func__, + " cfg.subK must be greater than 0 to avoid division by zero."); + int max_split = std::min(std::min(static_cast(num_cu / pure_tg_num), 16), + static_cast(K / cfg.subK)); for(int i = max_split; i >= 1; i--) { if(K % 64 == 0) @@ -132,16 +137,15 @@ get_heuristic_kernel(int M, } } - uint32_t tg_num = pure_tg_num * split_K; - // 2. 
better or not - uint32_t local_round = (tg_num + num_cu - 1) / num_cu; - float local_compute2mem_effi = cfg.tileM * cfg.tileN / (cfg.tileM + cfg.tileN); + uint32_t tg_num = pure_tg_num * split_K; + uint32_t local_round = (tg_num + num_cu - 1) / num_cu; + float local_compute2mem_effi = + static_cast(cfg.tileM * cfg.tileN) / (cfg.tileM + cfg.tileN); bool is_earlier_round = (local_round < round); bool is_same_round = (local_round == round); bool has_sufficient_empty_cu = (empty_cu > (local_round * num_cu - tg_num)); - bool has_same_empty_cu = empty_cu == (local_round * num_cu - tg_num); + bool has_same_empty_cu = (empty_cu == (local_round * num_cu - tg_num)); bool has_better_efficiency = (local_compute2mem_effi > compute2mem_effi); - // printf("oob %d, tielM: %d\n", oob, cfg.tileM); bool less_oob = (M % cfg.tileM == 0) ? (oob > 0) : (cfg.tileM - M % cfg.tileM < oob); bool has_same_oob = (cfg.tileM - (M % cfg.tileM)) == oob; @@ -153,7 +157,6 @@ get_heuristic_kernel(int M, compute2mem_effi = local_compute2mem_effi; oob = (M % cfg.tileM == 0) ? 
0 : cfg.tileM - (M % cfg.tileM); selectedKernelName = el.first; - // printf("Selected Kernel: %s\n", selectedKernelName.c_str()); selectedsplitK = split_K; } } @@ -163,175 +166,101 @@ get_heuristic_kernel(int M, return std::make_tuple(selectedKernelName, selectedsplitK); } -torch::Tensor gemm_a16w16_asm(torch::Tensor& A, // A:[M, K] bf16 - torch::Tensor& B, // B:[N, K] bf16 - torch::Tensor& out, // Out:[M, N] f32 +AiterAsmKernel* get_or_load_kernel(const std::string& selectedKernelName, + CFG* config_map, + unsigned int& SUBM, + unsigned int& SUBN) +{ + static std::unordered_map> impl_ptr_map; + + auto it_kl = config_map->find(selectedKernelName); + TORCH_CHECK(it_kl != config_map->end(), __func__, " not find kernel~ " + selectedKernelName); + + const auto& cfg = it_kl->second; + const char* name = cfg.knl_name.c_str(); + const char* co_name = cfg.co_name.c_str(); + SUBM = cfg.tileM; + SUBN = cfg.tileN; + + auto result = impl_ptr_map.emplace(name, nullptr); + if(result.second) + result.first->second = std::make_unique(name, co_name); + + return result.first->second.get(); +} + +torch::Tensor gemm_a16w16_asm(torch::Tensor& A, + torch::Tensor& B, + torch::Tensor& out, + torch::Tensor& semaphore, std::optional bias, std::optional splitK, std::optional kernelName, bool bpreshuffle = false) { - TORCH_CHECK(out.dtype() == torch::ScalarType::Float || out.dtype() == torch::ScalarType::BFloat16, + TORCH_CHECK(out.dtype() == torch::ScalarType::Float || + out.dtype() == torch::ScalarType::BFloat16, "GEMM A16W16 asm only support Float32 or Bf16 output now!"); - + std::string arch_id = get_gpu_arch(); - // 1. 
prepare args - int Mdim = A.size(0); - int Ndim = B.size(0); - int Kdim = A.size(1); + int Mdim = A.size(0); + int Ndim = B.size(0); + int Kdim = A.size(1); - unsigned int SUBM = 64; + unsigned int SUBM = 32; unsigned int SUBN = 64; - float alpha = 1.0; - float beta = 0.0; - int szA = Mdim * Kdim; - int szB = Kdim * Ndim; - int szC = Mdim * Ndim; - int szBias = 1 * Ndim; - int sz_A_pad = 0; - int sz_B_pad = 0; - int sz_C_pad = 0; - int strideD0 = 0; - int strideD1 = 0; - int strideC0 = 0; - int strideC1 = 0; - int strideA0 = 0; - int strideA1 = 0; - int strideB0 = 0; - int strideB1 = 0; - int is_out_b16 = 0; - int add_bias = bias.has_value() ? 1 : 0; - // A row major, B col major, C row major - strideA0 = strideA1 = A.stride(0) * A.element_size(); // in bytes - strideB0 = strideB1 = B.stride(0) * B.element_size(); - const auto elem_bytes = out.element_size(); - strideC0 = strideC1 = strideD0 = strideD1 = Ndim * elem_bytes; // inbytes - if (out.dtype() == torch::ScalarType::BFloat16) - is_out_b16 = 1; - - szA += sz_A_pad; - szB += sz_B_pad; - szC += sz_C_pad; - KernelArgs args; - size_t arg_size = sizeof(args); + KernelArgs args = {}; args.ptr_D = (void*)out.data_ptr(); - // args.ptr_C = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.ptr_C = (void*)NULL; - args.ptr_A = (void*)A.data_ptr(); - args.ptr_B = (void*)B.data_ptr(); - args.ptr_Bias = bias.has_value() ? (void*)bias.value().data_ptr() : nullptr; - args.alpha = alpha; - args.beta = beta; - args.stride_C0 = strideC0; - args.stride_A0 = strideA0; - args.stride_B0 = strideB0; - args.M = Mdim; - args.N = Ndim; - args.K = Kdim; - args.is_out_b16 = is_out_b16; - args.add_bias = add_bias; + args.ptr_C = nullptr; + args.ptr_A = (void*)A.data_ptr(); + args.ptr_B = (void*)B.data_ptr(); + args.ptr_Bias = bias.has_value() ? 
(void*)bias.value().data_ptr() : nullptr; + args.alpha = 1.0f; + args.beta = 0.0f; + args.stride_A0 = A.stride(0) * A.element_size(); + args.stride_B0 = B.stride(0) * B.element_size(); + args.stride_C0 = args.stride_D0 = Ndim * out.element_size(); + args.M = Mdim; + args.N = Ndim; + args.K = Kdim; + args.is_out_b16 = (out.dtype() == torch::ScalarType::BFloat16) ? 1 : 0; + args.add_bias = bias.has_value() ? 1 : 0; - // args.stride_D0 = 25; - // args.stride_D1 = 80; - // args.stride_C1 = 3; - // args.stride_A1 = 124; - - // 2. select kl - static std::unordered_map> impl_ptr_map; - AiterAsmKernel* impl_ptr = nullptr; - CFG* config_map = &cfg_bf16gemm_fp32bf16; - - // 2.1 static dict + CFG* config_map = &cfg_bf16gemm_fp32bf16; std::string selectedKernelName = kernelName.has_value() ? arch_id + kernelName.value() : ""; int selectedksplit = splitK.value_or(0) ?: 1; - if(!kernelName.has_value() || kernelName == "" || !splitK.has_value()) + if(!kernelName.has_value() || kernelName.value_or("").empty() || !splitK.has_value()) { - - auto it_sel = get_heuristic_kernel(Mdim, - Ndim, - Kdim, - config_map, - arch_id, - bpreshuffle, - add_bias, - splitK.has_value() ? splitK : std::nullopt, - kernelName.has_value() ? 
kernelName : std::nullopt); - selectedKernelName = std::get<0>(it_sel); - selectedksplit = std::get<1>(it_sel); + auto [name, split] = get_heuristic_kernel(Mdim, + Ndim, + Kdim, + config_map, + arch_id, + bpreshuffle, + args.add_bias, + 1, + splitK, + kernelName); + selectedKernelName = name; + selectedksplit = split; } - - args.splitk = selectedksplit; - // printf("=== KernelArgs Important Parameters ===\n"); - // printf("ptr_D: %p\n", args.ptr_D); - // printf("ptr_A: %p\n", args.ptr_A); - // printf("ptr_B: %p\n", args.ptr_B); - // printf("alpha: %f\n", args.alpha); - // printf("beta: %f\n", args.beta); - // printf("stride_D0: %u\n", args.stride_D0); - // printf("stride_D1: %u\n", args.stride_D1); - // printf("stride_C0: %u\n", args.stride_C0); - // printf("stride_C1: %u\n", args.stride_C1); - // printf("stride_A0: %u\n", args.stride_A0); - // printf("stride_A1: %u\n", args.stride_A1); - // printf("stride_B0: %u\n", args.stride_B0); - // printf("stride_B1: %u\n", args.stride_B1); - // printf("M: %u\n", args.M); - // printf("N: %u\n", args.N); - // printf("K: %u\n", args.K); - // printf("splitk: %u\n", args.splitk); - // printf("is_out_b16: %u\n", args.is_out_b16); - // printf("add_bias: %u\n", args.add_bias); - // printf("=======================================\n"); - - auto it_kl = config_map->find(selectedKernelName); - if(it_kl != config_map->end()) - { - const auto& cfg = it_kl->second; - const char* name = cfg.knl_name.c_str(); - const char* co_name = cfg.co_name.c_str(); - SUBM = cfg.tileM; - SUBN = cfg.tileN; - auto result = impl_ptr_map.emplace(name, nullptr); // insert new kl. - if(result.second) // emplace successfully - result.first->second = std::make_unique(name, co_name); - impl_ptr = result.first->second.get(); - } - else - TORCH_CHECK(false, __func__, " not find kernel~ " + selectedKernelName); - - // 3. 
launch kl + args.splitk = selectedksplit; + AiterAsmKernel* impl_ptr = get_or_load_kernel(selectedKernelName, config_map, SUBM, SUBN); const at::hip::OptionalHIPGuardMasqueradingAsCUDA device_guard(device_of(A)); const hipStream_t stream = at::hip::getCurrentHIPStream(); - int bdx = 256; int gdx = (Ndim + SUBN - 1) / SUBN; - int gdy = ((Mdim + SUBM - 1) / SUBM); - int gdz = 1; + int gdy = (Mdim + SUBM - 1) / SUBM; + int gdz = selectedksplit; - if(selectedksplit > 1) - { - out.zero_(); - // HIP_CALL(hipMemsetAsync(out.data_ptr(), 0, elem_bytes * szC, stream)) - int k_per_tg = Kdim / selectedksplit; - gdz = selectedksplit; - } + TORCH_CHECK(gdx <= 16, __func__, " gdx (", gdx, ") must be <= 16"); // 16 = 512/32 - // printf("argsize: %zu\n", arg_size); - // printf("gdx: %d\n", gdx); - // printf("gdy: %d\n", gdy); - // printf("gdz: %d\n", gdz); + // semaphore.fill_(selectedksplit); + args.ptr_semaphore = (void*)semaphore.data_ptr(); - impl_ptr->launch_kernel({&args, - &arg_size, - gdx, // gdx - gdy, // gdy - gdz, // gdz - 256, // bdx: 4 wv64 - 1, // bdy - 1, // bdz - stream}); + size_t arg_size = sizeof(args); + impl_ptr->launch_kernel({&args, &arg_size, gdx, gdy, gdz, 256, 1, 1, stream}); - // 4. 
return out return out; } diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv index 3e58803dd2..63698c39bc 100755 --- a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,11 +1,23 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 
+_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 
+_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..4fef26f0ea862281f562112f211870f6e979780f GIT binary patch literal 28656 zcmeHQeQ;FQb-!9IVi5v_&4&QNE?CCIg=K{#By2twA+QVr%MP{y6E7=i#mW|vyjo;% ziFjH|vaBdVRusie)7Wur$N6y5q;}&pskHH=P0}$PaMGzgwL2u!%%n3(>$*uNXj9N$CRoJaFz*CAuALz{Adb{Ef+GoF%y5h zPsj>&Kg{*Dh_Jubmx&!hWb5+fjy#@=#P{N-{MGxr<7e?EyubH)JhkSxlOK-veqT5i zUr$UJF9Y36ggT8vWU~qXeKBJ`!kznTZmGYMvBX!yCz_(2VgLP+SX-pE^Rc04{NdQ~ z&NoB2aQS>kxJfyA zEKbigoDT62DpLG(N96M@Wt+??evwbHy{+~16=%3h9jwrDuy>tdk;UovJ;R-9hCa)| z-gSoSEKa}g8CI(q1}q1A*BMq>oTig^smj* zzitNpPd7y3=>@2tK>&RL4rb|pC`05T(XUe1UoXAmW0A)d}Qz;oFK_{t0dsD-$k5q~~| zC>aa!Lbd^3$~M4vW)MIwMD^v2NX{Ti#zK5A+WMEFumXQ=&TJk(IS`SNo3W)LWIIZCpQurBKe>)%g=nXxKQSGR;aFRxob9P#~; zKj(9*EN<_1TmKx9QK5#6qX*y5`?au8=W}YZj!>I*gu3?=!8~j4k*<5bBOVKNMoxq~ zqV2KB#VLBYoX|8akuIP_{;h;6!;0HMT~=ZM(ni?})W@h2x~2 zGG+FHKNfj_2-1ure}b+pIXPeUcOLJEg&NyWw#7TX;O{(rqM^O@^+*dm6 zc^CVd|G{uqq&e13ONkW;DsHzc669?`FY8D z!a?Tp4Z7UEl-s>FMQfFExdbxCv7hDnCTZWr#wT*wrzUb|1-j2BhbAwu(TNLen6^RM z`e}Q1BE>#2flN!Wa}qx$@ezq%;J7fgkgvhK)I5%JQgb-YO3mVUed>CSH0}|6(?a7o z(m0MZjw6lZxO~#BHQmU_33ZeOz9*`C-wT;b$Rx>)bzj~C=5x6|8AvCmJqa`R@V zT-S7yluEf40nesfOS)5M8C$i?JL;wPw7DmLat-+vuxl?eZw`Iy*7s_`HpY6H$2a2f z6u6faMVF?$qYlONt|FhJqUe$&VXxRv76}~Tw&hXp^5UqscvaN9$`|$e)-CL#ecqk~TUX*;w=Q}EOIYVIaHV-9#POfZW*!MKk3M?Ot7uMc 
zS_Ah^#-3pLYZLkTCGNTFqL?4n*iUQauqP&v6FaK)(~Po*AZeqC4~4aBs}>g=lJ>*`1%S8O7iCBM;|6RPYwztM4m^r_nz;J9)*iJl5Co2>G!==0&T zSm71t3-5*IjJ)CJi;$l$LVmso`S~K`=Zlb^FG7C42>JOU&ctkNuS3GhfVGvOz?B^>;%s7faj$GUPoSfiE13xC#nlR z+IL*vq0jO}H!D1N>FB+4oQ?%UlPMS0Je9h44&{I=zZ=)Md*|k_b?3g^J(8a+QTC8# z4{`h7lKp!n$8ryKhjWi~H{>4eZp^*6`&jNVk{d{FBDtC5D9QJc+)8pA$sHueNRE@- zMe-?|_jQD}qqN;i z+cDZ2XltUanYJix_tDl$TN`a1w8dzP)7C}XDcT;Q?F?<*v^lgTXzQh|pSA(o25B3j zZJ4$Z+D2&`qiwv~@t<*6;q8|evmE-Y(0&dTC6bpY8BH9?OO=eqj^rXG)8_{AG9}~T z4U(5D84o*uUsAUn9u+8bQ|A2(rFLko7f!tgjJdeT^XNYXn(eBgpz1LDts@@|D(TK(5h{u|@&% zqw~b`2>bdPLDts@vc5);^)-U5uMuQ@jUel51X*7r$od*V*4GHKzDAJsHG-_K5oCRh zAWySKE7((XUH{yKx_&P6ezPZjZfKIa-pPt0?;~E>BQBb^llO>>z4K4kQ%VK}x|xg1 zo}j=2=H_x?P~ak#$K|4+z+&d%vM(sGgynO2T~Oe9R>0*zP+&Qm&*iG1z^z3I_v99L z?_?|}=D42yr-~@TBU{YWW#`VsT!n`HE2ozGA4>{s!nocK$#Gh~LEpbY>b1Oc_HzGdQebNpj$isyaeOTw zp-)suy_T0w9ry1@3XEf1?-n^u%U9_8w@JO0x6T3Xe<~^P%vKzK=i9}p)n5Mj^{&aE zT<@&!!!xeUJz&W1kd`}e z|Gk{g`9?mEpg!Xoq~-G{>N9kDK8tHgP+nVjeI7%7#yBIN$5Ef5*YjCigKFiqh}Y*4 z)Mtz{@_7{X8G1dR#Wky5Ub}dG9z%V`I3u6OQJ&lqRq^C;>w z^m;yvYvM6^ZRGWN4D}h~jC>wPeTH7oXK@XU%4;dF&m*YM7-!`3DC#rxdOnM5Zil?~ z^7=f6`iyZ#K98e5L$BwvxJI9n*J@s$N2aUK)R!S|M^$~M{!Gv3v1#h_xKW=+WPKi$ z^?6Lz=W$t|M`V2-mGyZ{*5`3qpGRbU9+mZZOxEXdtv=tCmsy`3)Mwm#X!)E#eTGiU zXW4Hs*>CXr+>82*aYjD(qdr5g<+JQJJhI>5_1Qsv#yBIN6R6M7YxykujY8RP@cP_~ z`iyZ#KKG+OL$Bqt>^F*JzrpLXgZhkdMm{G{pP|?CS@s(~*>CXr+>82*aYjD(qdr5g z<+JQJ*2#WjJ?g82`iyZ#J||G0q1Wa!#3b3)eVURj^}Wqo#JeNM>w+$-yIzpT%Wtj`HqpL=C} z?$_$`Q?oMjc>wj<>^D%K?fphj_8YuD51~F|oRQDNsL#-A`7Ha5TG?;#`aFR8jB!Rj z528LpujRAsH|k}-!Rzx7>NCa}`872T-3e&dBFM)Mx0me3t#j zG1+hM`aFdCjB!Rj52HRqujRAsH=?rNXyyF|>NCa}`8C{RXekL#WRf zXXNuR>NE6OKFfaNlhr%D_k$y-&$u>e^?4Na89J>#oA6 zanxt%^?VlnM*7|t{qG3sGsYSDJc{}Zy`Ima-$>sp!+#9*8RLw69!GtKUe9OIZ=~<7 z(f^L1K4YAb&!ec%(Chgu`i=CxIQ+*@pE1tJ=W*0$==FRS{YLuU9r-+h`iyZ#K98b4 zL$Bwv=r_{$`tTn^ea1K=pT|+3q1W?S^c!hz0R6&8={l+x)dEBVaBeFh^ z%KAJe>+`s*&m*!vkIMQyChPOKtj{B|K99=!JSOY&xK^KUqI~}T?-Ez!IBZ;ojq^ZG 
z+c*xB%fL7r=YgEGaU3=-!^U~Q-^Ov6Tn5J3I1l8cjpML!88*%X{x*)oF_KFwVw#AkSfpIp@19@iSIBZ;ojq`xNjpH!642-jJ9>}{ZaU6O+>s$x?Y#fK4&)_|f*tiTE=Yc%6aU3R>fpIp@19@iS zIBZ;ojq`xNjpH!642-jJ9>_Br$6@0#Y@7%DZ5)TmWni3*^FW^2I1U?^VdFgDZ{s*j zE(7CioCosqN*srt&k5B_6W0O1UP}+H=QFqt-fvt5j-w?fvp$;~hwL{pxQsyy=Yc%6 zaU3R>fpIp@19@iSIBZ;ojq`xNjpH!642-jJ9>_Br$6@0#Y@7%DZ5)TmWni3*^FW>r zSU3(Fmto^P;BVtNOfCcCY@7%3@=6?sp3j3C*8#sF3&)}7Gq?`kZ(IeA<3)PzNdL~; zWjCxCG_&I5U7<2cOc3NX&b zc|dRDILzk|FwVw#AkS!U?@{g|>G^qjrWeo3FxL9o zI>y!q1#WNz`jY}TvW46p2nsB91a3+SEMsf9y*wze!V$PRDX@~2aeGxz;1);VO-X?_ zvl?#S8WgzA5x6}ma0k1S+k-)YI~{@5NrAgqGq=|S1@3kP?nw%~g+0jawLyWmIs*45 z1@6N-6w$Lnd32uetn(s|c%~eBpVu?_yA(aoOzX6Y&cUKwdJfVlf789bkZ98Ki|!q? zo{%>fvfq$58gjspOAUFGA(t6)xgl2=@@7M>G~_Bn-eSl%8S>4Byw#Am8S-{R-eJf= zL*8k~)rP#wkZTNiw;}H_pv!nL#L^NnZI>AC+x{#nO!#l~lSE33V*{;-Rv z@zlljOY>Yr(O>i|ew#tR;U1yiKzgF+xxSA5^5U#H!`$xwh_L$&zl|ZG->CWR*Zcxu zVGkI7rHw*gs`(wz{5C~|eUsr=c3kMoG{4(4zw#DgFE{)u?iczB&97eb+k8UUHyeJH z?LuFv`Q5JhRXrf=RfgY|PNCnT`Q4%U-E>meZ!-LDeo*Ld*8D!K`E5Nd>{|`LZGSBE z+cdv}n&0-13j21$Z^v1o-=X>4srdzaggt2Z?R;41cWQojX@1px!d`9o?RrG$cWHii zYkoD43VV&=xBFv4zgzP=r1|anxUlar{BAiX^tWh!hc&<2$ArDs@VoUBLVv5~cSQ5s z`$=KnYxwQ^l+f?f{O*x{jD7mG>+7i0dk7EEhj}3({ZPwUMtBFwYX}dLd?VpqB$p8$ zBKc;*BU7%mb@E!f-|_Hk(}ASG+vK&jUS4Z&cb0Mc9Z7*7me<;Y@>+YRb0fFkl@xfl zyw)C)*V@C*&D?$@De#{6e61T$*Sgv_-E~^+b4cEA$OjDhHbbsAAGpt!!HVsBAANo7PJz+w;n%^_9x@tg>l6rm{VuY+ApmY>z9O)_W@3 zW6GxWp~^O>Y+6sMY!54&)}Jcd8D-OYRb@M=Y+B!{Y;DS>^{~ozT-mgKR@sgzo7USZ z+hJwX`dnqZL)o;RSK0O{o7Vp-TaB`5y|A)vQ#P$HR<_N`ruE3mwo%!%ep%VpDVx?i zE8A*i)B0#-Tdr(cPpxd%Dx20{D_eoGX}z|x%~Ce4?}iQcVd%X-Mc>J)dOViRq#gz8x zV*WQ>Z(z$01EvB^FVyYf8kLrZ#A#-yQlj*-* zQh4T4!Fo46-*@KHynw*@95{}};Liu`90kjJC~KR>|B;s0ms z*YBj-v0E?g^xX}A#Z>?)E2wK}qkk>rCLjG9BEGu18sDaXpZ@c$+t}LPc03Xbb%f(l z{IR`IXL~3fr9Z|O>p0OFY7Do9I!|_Vw8!E!qVfLl@s_sZWRHZJA`K^xhg#a2+e6Lm zu~4|RHPqf2>WH;Bx3oq&H+IGvH@3ERo!CH{`y!2TF8BgXP2sX|Q**`U%FUZ2fpEAn z&=e>utt@MdG?i6WY>AZ8KQPkR(D)%W>#2yJt1$Y}$C*FMwBb>7=*#ilwf0)d_h+*U zhJKCIk8IEe)2C;d 
zo}Xlj^JGP){vkQO;Fe5%pVY70oT(p?`nT@R)c>i}Cm+bvKP&aW2xscQF7@narv5up z|Au&jY2){${s)HsRjGgFWO^vyc~gx)mZ|@_)W2Zp|5NJE9M2s8j?}-_n5mz`6KFR3 zo}phP^-V?``g3mdn$3>3X2!Wu>T3;sh15T1tY=W_3u2jZ{tkMVhpB(>NLlxh-u8DL zA0@r*?>aJnTYEgh{7t9Z=r2_=|M9ky{%E)}%H;o{kH?rl*4`A3hnYVfITh!l!zWrA zNgZj8wbMtusfj+|%-`640)O0-;jdoC!wrN{@)mmnS!=i<(i#c`D)yvzt7_7_)#d5k z-IsT(E7RKQ%VTTO14?&Rr~#$BtJLnMz~$Y`+RE}WHJ~i8Q|(q)?vlHurF#NOTeGKH z?e3~dYpeI9r&GOWlXBa%Cmpw@T4@6{JJa*sbNLJX)Y zEh<=P>25XO(oKP~?EW5RcHa1FnxUG*2X^kiRTMIneltd-^K9Dib3y)6I}+8uZ>7xY zx}{|J;wN>=Ok#U6f1=i}Q0npA3Utc9wqF=(n+3jv z-&<-g^ZoB98+68HlPUkNX};K1{tEFmxmr7Y=x^ll19D>X$mpX@&0pouy`)ofr`_~x c-F;$Ly?Nq6AEs0RsX;oR9>90uB%`YQTtysL^4POfrFx#3V%Q()fsg zXsseutJc;SV~jDzG)+^x?UJddd$!%9d(?JM@oeo$d)htSv+ZuzZufM%U1z`F{WILT zxdF{sj%SaX=fM0M2|NDrA{Lh97l-u?0eAD-v<|4k0Vl z{m|Ff4+{J1Pb6ZK5Ls!u+>ys~k@!*kl)w7jJ|%P^x2S_#D;gesvY>okS3Q5UP{(k0j|&_e|U( zKh{;O1|eS!LenII=!4Ljr2nBL{SQyVUk<{S_?7UcGl?krD7<+z*#IY!4M4SbvRA{F z_|@>{$w@>>7>LJ{4e(U50iK;i0C`1hI(tQQP9jReKy)V?pf}k7{gViw2IA}$F*u1R z2?H^lY=GC34e;Gb1ds!<`Ro-DokWy`f%ty10e+BdfHx-*Kn=v%E8^`*L`fKkA152& zOtJxfI*9$Z0lIhSgYdgaL`WEf|4cT(ACnF6 z=Sc+UzCjy=w^G<7Lg=ILR*F062%e-PWK1GLPfJsMUF&|86mO+uP9jL+P|Qs_LRQid zvL_LtudXH7c(65CnZM@jaQG$>C~-J)la8<~=?E)7PK3#^D&3pwLM>;<%`cAl@yMUI zQi?5Z=UdkI5*Zb&OgMVruUs7an;Q={(cx8BZjZsL`f#XW>v!^+n(Nv^t)w2; zWj37G9KM?f(u5;_iE2wq$}4#-wN1^zs>Xv2txYfGwcLB4va$ZXKvI3l+h0xGS3K-n zF7{g9J)yR6O>-j+B}OEmxZSpJb4y)g!zo2POFq~HnCGMtORRWy+29(&*~E+oBu^iZ zdbC#?DEdO74?y?*pD(%S{`o)pV=C=q!v{1qYzd#F#(*W|O(U4lmGcXUy^qT01~xy8 z*?p-Q>?r;4>N$a}-G$V-**8z8#8QsOV)P6r%@rM^gFA%3Ba9OAlQY#7?Ibxs7$tNF z9YQw66%7ys2+JX-q(yx|S6b8$OidHR;ZBP}WHVgROd2kS(1&50M(qd#vnUSa^t5O> z;X};jd&uSX#oX?tF&e9w%O#L8j>lP=ZWTfW((L&W_FIV=yB&gX6T=G>%hZQ#dY(E#XM*9>9|p zYR8eu#4Vd-V}Q3cI@4Z4U8RS z9^XliC&N8ID>5(UJ=IAuy$i`FD=TtIl<=t7k7fxx$!!ZF-UZnaZ}!57ccCxh^(~Hg z7w1O2xl1G7rOP7TWh>6c%JLpTyi20KQ@*3VQ(m9s`4rvYd5T`*Y%4{pk?> z==>!5(;@nEjK*Lg_03IV;@-sAb1Z#nM|yg$d-}=<`iU`LLu2Q#=T74qxyG9|n#zvF 
z&%vK!9ixtGu-$PD(KbxmNIS;+V)9{cw~uA)aXY{n5@$-BB{5UtY>9IuULgqIR7Aj~GblrV=dn{Xjv z4xyKDA)$}ZOSp*8N4S`95#eQoiwScHFC$z+m`k{na0%h%gi8sR5nfKXoNyW83c}@t z@#{B>@+RANY98t5(fLGwvk1}O`Gn~20z&lnQbP1Mn-KlYAw+){5~9CeLiE>1i2g1j zM1L0(qQ93BqQAL>=6=UUJ#U^9%e&HX=w=za($6tx0p$eUuVU=g(7%>pn!fb7@6o~xPrFn2 z+!6P+9=ew}qcM*s{lnOOH@ate+NaPp;q|!a`T6m8-P2vnJ>3&=)4q@E=z6K6@k*b$ z?}#TOQaJO;&lk>2uSX2`3=f++eI`5NN$2_$=txc_oH`}Y4)oDIqR8hxb*cCDBZWtv zJ5uCjqs6{UkNJ@EMP6|}>1W~RlaQZJLVi98`S~Q|=aZ11PeOh^3HkXX*#(^p6{U;DqN)2EU)3IYakcx=+%5$w&K+>!)-t`B^*5+%;C;FwJS2_3&uSg>w>% zHB6&x#FgHTn%giveW^S37wsq0$8wduQL{I4`#+QYm$^-;d)x0$t!!^jt!{5gt!Zyc zjgY*L!PijwjSDgY3rk{pSFQ^C-1PsGH)82$5JR4X+MQ-nj~i_852^H z=PMZ#RgxDdner0yrAo$^QAo~KGQPY)a*mRx3VETDG4Uh2SIN_a>{Bwne?s;}N}ev{ z#Y)Z)@?}b%A>>>o&lK_!CC?J_QYB{!`En)C7VI)$Z@XJ#u)qh7(>>_7_vUbko7T!tdB8deT*UN zV+>gzW61g#L)OO_vOdO;^)ZI5k1=F@j3Mh|3|Svz$od#V*2fs~xyHCxj&Z*+#(o?} zUdPzi#~89c#*pthUAA7jY+7(>>_7_vUbko7T!tdB8deT*UNV+>gz zW61g#L)OO_vOdO;Cm7>PSr^Tlo~m*q;&XP! zKK!REC?x^{-OR;hPe5P>b8|T}AaE{A<8oF&U^eq`*%uI)%hI{LEFf?t%iyv7S37f;^(tyCStPc0+@+n70+XCWe@Ux%)s59k_Z2i6mUDuR1vd#OS zQ_f?RM+NReyY6YSopwDz-!nt%wd=~+$^G|41y&T|*t6di$I-4g=({eKdhNP%c5(l` zQGwNH*F9gh)2>J8dvc^+yDpvG+&>Z(SYM3eFZzWzzIMGr-?c>Qwd>Z|!~L700$b6p zd%0|tde@|x^`~o{)eIh53k4Z^FI*B*RFTyyNaYi1ym9t&|C3RHkE31pb+Vn7Ptf;l zmwGL)oJ#KhR8-)zB{=>~b8=#fy!`&;U7S9;(phS^sPmU-J@;O67#M1dCE6l z2gLmu`O^2|8`&}FwEHxBo{tv`2y~-e|I4zSk#7U9O1+kE>;>-c2?)$UUi5zbU2z;E z-}=5O^;*8M7rB3CK;T^X_kUZqGxBZVZ=_z!H}(?u&k6|4Mqc!OkMaUPr;%@ce=GG` zzOk3Nzb_y#7ykV}l zEQEjmd$OI8Zv*d3y_Rq6HSS*=5V#(B(ff1C3x4j!yb1Fx==**p^;*8MuXF!EKwt^l z_5X`(XXM+!Z>3(#H}(zgUm6fthWzOLj|uXP=4!C_{Xym%&Dqp^4*LH8w&&Zx|0d>J zZ^~sn-}Nu)-y>^1uWy5>Z)j)a+Yss-^m@LDIl*2zH{kVc81)V9jC>nGeS==lH!+8Z$hics zZ-c0BXlLZx5b7KBdcKJ{N0Xd;@cK54`i6E!zKx*1L9geVn4=t$a}{3S22tP8&d9eR z)Hmq$d=qmTN6u|{eH%u7LpvkiMo{0N*Yi!xflkP|5U+28sBdUz)VCqjH|X_z6LY4M za_+?I+c4@I+8Oyag8Bx%o^N7~^^}}z@%lD6QGKI%JMwBs)i;{I>-jc3L46xB>f4~K zZ$q-a4a@pABJ10rtZzfIz75OzHX`fWpsa60vc3(=`Zgl#+n}s(L$baN%lbB=)wk1A 
z6Z5ST^-Wtxqi%Jfz8ULjIiF;5KFRA_H|iVO8Tr(9X!W zF4Q;ZwS1HF$xJz)!wIiJju^GROcI#J)y&d9ee)Hmq0e3SD@ zpPWzf`qqv5hIU52^`O2%ujQMZPcD=5NnYPNQQy$c$hR)kH|Vu|lk-WxoKN!l){Xjx zc1FJSpuRz`<(r&O7R&i0uWy~GZ)j)aTNmmZ^jf~j`D8%OCwYDAMtws&Bj0*Z-=Np> zP0lAv<$RLYx6X;`o0?Cm`ex22C#Y{dMt$p)^{q?Rw{BVAdSrd;l=ZDk*0*k1-+E+y z>y-7aOV+n;S>JkOee0CKob_`8I(12ECSV zay}W6^T~QXpG18_J0suvP~V`}@=eYso8)|w*SCJuH?%YIZ2+8Oyafcgf#mTz)Cc|y)7d420eeM37V-}+GB zpx5$E&L>aG`6RDz{ittfXXM)e>KpW0zRCIIQ*u7Z>s#+c^-awuRedw(lM~dp0i(Y4 z%KFwP>s!C9Zv(Qv^~(CzC+l0ktZxIdzV*ub)+g&*zpQTqvcC1o`qn4wTfeMt16qAc zGxBW^^^MOdwD~0J8+7C6lT2|zsDs0(Z)j)a+X(6#^m@LD`DC0s!uo3v^$qQed>cZ2 zgI>=!gKL8SFzOrH8TmGX`Ubt8Zw5Dod>cf4LpvkihEU(2*YnNbvfw|A`i6E!zKx*1 zL9geV!F^%nH zeS==lH!+`#b8+w=Mtws&Bi}|)-=Np?P0T0b+#TkNgA>&^HJ?=V&74n8P~S$3`Zg%* z+mNhp!?M1O$oe)Y>)Vj5Z^N>_jmY{oDC^shtZ&1zzKzKGHYn@ckgRXRvc8RI^=$?5 zlRx=W$2mDl8y9KgERl0Ij?&~J(ay$MB4=zIrHzZUahCA6ag-((iFP*55;Gf ze;Y?>a*=3f<1CQ}HjdK9McOz^_}e&2lZ!+<8)u2UI2T8$=bO$|!q3K0>iGt)68ZLN z;3)4-P0Tlwqm<7P%XKc&##thdY#gP0?ilAH(ay$MA`fgFrF>4Az(v|POZeM3O8Hzf z&PAe~jk81^*f>i095jK8v~ia3w{ev6xoMn>L^~U2i9E1zl=3-i0vBoHEa7kCDCKk6 zI2Va_HqH`xVB;v|bKC?j(#BcB-^NkO=e}_+676i9CGzH69HpLbI#&rl8%L?<8@Nj3 z+oyq}e8QEOZze}+<05UGCGyC|QJP#N+Sxcu1TM1A!dW7Z zY#gP@MWUUJvqT=)I7%BAY2z&6Z{sLUE)wl*oF($0*TPZSxJVmk34a?$X>yThXX7l9 z2R4q<#zopVOZeM3N|TF3I~!+-Jg{+;HZIb}S;F7OQJP#N+Sxcu>`cPoB2_1Gxh$>^l7^sM!!kEWy^qwl$=x2t!?@V?M0s)u+VAm-Vf zSwijbUI^Z|IEr^GZrvThJ0O|#?7+Kwccf$Y?%WREwt~0CJ0tEpD7 z%b587CS#}HyTVTaXA|OO%Eg4mgaw4_2}=kAgtrjlz%_)Wg!d4Z5uP9{Cw!W47rlQ- z??BTpblyz-DuMVL)J!BXCVr;99nr+lvDN*Es^OiVD1%6>$6d zfWQroz>QIX*RT?94+I2mas+OU3f#hO;r5b%z^#tJZBc>OvKnqL4G6r>5qN!6;0^2^ zZZ8W6-0le65fykNJHhSc0f9F;0(V9Q-pron_FVyiw>ScKM+M%Bam=Fc2B*<=f_Ka2 zdc-?@(EGfe(f^6j`(rfD3+bF*l1lG=IEC-K{h9Pwr|=#3HFQ28=Ns~BLoP7nLPIVx z@_IwwV8|N{`5HqG81g1V-fYNQ47tRRw;J*`L%!CKOAYxt zL%!aSZ!qLCL*8!4I}G_oLoPSun+$oUA>VAsyA1gjL*8x3w`ww7i@09@jlRn>cZUCs zjdThA>(mZ@&+4LkmbzbcX!olV)JEN}@E#$(|K;mSugD}^I)(06c-N5L_srz?F1+Vu z+`HDZ(zpE~7d=Xhz2l#k=AuWM?|3f$fI**MCG`2Er$?GxzlZ(yohj2E;r7*GVP9?d 
z71RoSf#!Fc=2uuJ?1hG3(SD&X()@1M{MH;0_BDpz+D4&YtNGob`4!(S?8S!Px)z~d zr}e;kV@hq2HqUg*3mC2Zg=F@Z0(Yq2H?cRce0Q zz9{V548Ln175Zy6zbehI^f6&CHTrjBww$xzQduH@eO7@VUV4 zQGs{Jxlx6j8{O&5=k~jz0)ujHv{%lJLe2_quZ#+;l5?YKIX4PBS95z!RA8-~8%5;Y zsLmEt$md3F=f-%LCW@!vT17~WxHS5wDpm)9ac7Nt)y%Rl}%eWDO-cGX=^BDt5r5_ zJ*8}Wl}%e)DcfzzrmeG-ZKtwnYc6HGLD{tRm$H>8o3<8HwhhXrt;>`Rzaxg{gSJLf zwkwrQTdyhGGG)`&ZpyYu*|c?>vMo?HZB3_a7b%;zzEidgWz*Jr$~HyWv~?eB;32T~ zYozZ0srps#{>!ZW&R4TOTNB3K`D{&?$OU}1CKPL&&(?$=u+P?nd~W*}UlU?p*+uii z@3Ft8-+%F)(^sW@c$#5dNb5vZPo5#WL-y}k?C?{1x8$@D^NY5awANHHkJ(~M`$RDh z*<#XqRmE(w#gz7mVpiE=(wbMryu%h#+9!&+!xod)%_`)ZnEAGt z(mqklTw6?9zpI$pwwTgBQB15+U`x1`hyzeDr`lpl`$RFZMj02Acmx&m1G101KBax4 zm{_BXi%DFBiizj!<6=ttL@}{O85fiI3>EVSwwTgBQB15+#>FH~M8*7uEvB?j6ccNd zaWRQkQ8B+}iz)3B#l#wATukC-RLlo$F{OQ?m{_BXi%I;BirH$5DeV)*#2RH>OyY=C z%%Cl%v`-WhYm{*@iDyzVueZgN_K9L*jWRAKaaAg2ku9dQPZSetlyNbM?@}>+wwTgB zQB15+#>FJgOvRjTiz)54m}-rJd!$;UY)R=DjsGsr%)?_DemA|Bb$Dz>p}?8N0%xrk zNWXN6+h>;uoKq_BqB4PV%LQJ%%k8YB*cW;&!ZKU=@BYMbFXVG79Cya;ZU@J`kk6}d z+?f^PxEJ!d6^=XWE^*uo`TPpU&D<-Fdm*1=;kdIa#c?m>^DG>9PPI7hg?z4s<6cxF zj(Z`WZ{fIeBjUIh^f?zD_u_qY@8aJHIzrzoVf^)8peGCeGyYmFP>TFDVaSh51zyL+ z$Kn4o_JM=D8*UCZ zg<2!{W%|LE#$aoNe%XF=(}9*?Rj47@aoW%A4ssFQ~|9j|N9;W`DvGTfs^tQj-@F3}J zf470>LD#P_bzrSc(e7E@Q?&jjScAG!G zyLoebcT0YJw`7aj&EK+4?XJ!*jPDl28x$1AcZ=7>k5;gDjcQx6MIAW5Xj6RGza`#w zn?K&2twn2<+t#AhayNg?R@KRTe>^Vjs{ZD$UaNxT=ND~NZuv#3>(n5=Td*x2tYDk! 
zB=szQw4%-1lKXoX$$8_iWduuZ+p%f;b>c>*F26}5E@%9T3YUtYb|k95U#-llZYdeQ z_!Se%P1Q$*rLrM@jRub8k+Ld#vq=80Sd6~)B|fS?D$JF7Wmo=EO~2Qi@!G5Gs-7zJ z8UCvO3cX}crA_&(`mNBB{tECj=%33;fKK@b&>{N7QHb&X1Mxkvb0q=%)0BRN+*u$n z{&&@t-RM9O>EVl?0Uvc=Qhx`%NvPxV6E%LBQooT9I_1CYNnu#lCh!b?|A_W64$roe z4LZNp2D{4tHAAob749ax8aut|8F_ra?AQ!4`e;-ASJ%&7q*HyT-T22N3NBMWzcQ%f TDmg+sHvdD?|3`*{A+rAiKfbAs literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..7d039667e46dd295e6a33b94dece06059124ac8a GIT binary patch literal 16432 zcmeG@e{37)bswpdbfPGWvSo*sY28U-WW}XuN+K!Qvt{Z>Tz7RmH^_>%$i`A4B{HT& zj-(yC^XL<6DMl8=o?4{C8X#`sI9b~?9gwWqT8Qj0VEMz|?5uw{SOTa+2V_7Cr2Bte&Y@lo=jsl#{x8gTUdjQX9x7-$Z>gY-(&w&h@*WUcRcR%FMu45cfSXY zMg55r#=`)G9U8d`HIy_1{MB|sWPsz3PJQ;oZxd4bL2N#r%*XsE6S;*%I{!z{zT%JN z=JM~8W6MAA$7TwrV(DmXW+su(e>azi#iLKT3z{d;Zh0^r=MjWNIP)y2cuvJ~ca=$Sr9iFK1(ME+m== zy*7O|N>y-8g|pej%cdhj0e8fDM*Of2Ej5mK)2f2E ztt$9Q9SWEuCe|~eQiqlrNBp%_1@Bl@aJ>!%+!5;;akCCBHIDe1RRy=LDtNyR1y>)9 zr{?)(^ZHsH8fu*I3#$q~w5s5jbtt%cTd;W*gnzTD;6JS@_@6oyT>ZSx2{&w{ z4i$PY+^`9jHHelqIP1`GEuT%N3McuZxM6eEp`_Loot8DYEo<=9q2ar!d^B^a5S<%Cd%|mNgu^9}UK+ea!8;*RzFOG@qD{Ws{j)Vm*E*w6X6ueqS;X+cbWE z;h)Q8PGw>Ex%tFGAv&8nlZZ$05$JDmWBSkOrX=h}RvBXH)ZBvb)&@F?+^~K1ZU21i zOmsFKD@32lv=0hAZce^;F#46_p%`po=7OWsvxDKe#Qc18Hak3oNVKpJ9UeLp z4o0W*N!XxFCt!aQt7CK~oro_*;oDfv-8uL^m~V`OvQgP4m)2}tvBrC4gXWI(}~$!1{{h0i13}=(}`R@ zm04Kf+s?%u&v@BhEKpI;a+cBn$ zmByY6DNSWNxlsODyR9+?=Bni7ReM8IvufX31WZ-!oe0mX_8mp_JR#jXr4eFS&i|?;0gb9IG_DEAF+?(E#1kB9%;ozg16dX z;dTIY0o)3(10eOc8)EE{SK5&WY(ZWZK;-QJh`d_?Vw_5%!+VM9cFIdmr?;uqk^DN~ zs_1lyHh9Mfc*)V{b*s{fEWjF({EF(|rzlWQ)O`U8cXxrWA-7fX`YINX3#8wxf}bEA z$#xj46Xu1nzwD;IR;1nXN+;4`4%ojPU~sXu90q^xT9z*bonlc5zPKoSR)oA$%BmdT^Y*nU+4g74svUizs(mdGBfF!B^|7zj(I+%KR=n&u;pP2%bp3m1 z{}-YEac_S^p!jgZP;qZVu=tsVaIwE(6z~Ip9|U}?C_-*3_5h4K1ZNP=Fr1@s9)R;8 
zoMT1Be@-E;FRZl_8{`7CAx{u@aU3t8fOl{lFRFlV1>7eHiqhCvYVs|EjwjYSyh!x#bG7}(a2fw+DQ#5WmZ7kLTR z>~~gi?d_C)w0I8U1Xqi4$SplBvD(_%Hco3RA$R^^8@MB)L4nvQE=DxybeDv3%66fA znrJ%QT~@#$ugfCQh9epgB}l# z-I4RCJrebe)axkw6yFk!TcD(1fA9932l4H}QZ=?n_wLl>Dqt(h4JZRkNblR7^YA!F zNY9UYdWG`y_F8-Ud%#{@{Bgc}AATw*z3pmB3)-M%V&2)%V!$j{>|dc^O?ujn9rNU*KZQvYX*A* z7++&PV|>l?nebYOc@V_-n&mUb*F2xAwG;C-uvvV0KJ)k*^Ldl_hE3x8B-s0{JEgnU zXtP>uRuRLU^|c!1%xbjWUaZyJ-e1E#n*JJWtkGt**sNxw9cDG!Sc{R* ztY+i9?^dJr_Udaj%9+(@y}ekgxxH^0>^0VCvs!Fcv(XN-8f~n_$Y)lwao%^U(RzD( z`C5irjdEU-HLSN6Yc;p`tifJmjb`~=Q;YYS)NHiFtVSDaG4h$!Y@GMqYP8pI`))N_Z|@L~FVt$3 z3*IBXSgX0cHn10KG(J}j*q6#z;CXaJa+H4#&zK54H}QQHd`ABr;Kk=KRqS(3!Tl;e z594#@1$?I5)0f0&Ul%+ZNPT_DQeU69M0tIb7oU}d0bkOM`x-u9vgGr53A=YEWc9sm zZcx|(5WfG#eFJ;(6U-a; zBRWNG-v;TjB$oeMg*^utryJ(9r2(E7mEZ@0+Xau~!CS&%m=EGR4S1IU-(|o(27I>x z?>69)4u`qme7C@TU}sz4uf|{st@aZ2`Agt)?(dS$-xq<0`y2N@;JICX$>DPW>}v#n z*RWMa@~it{%(VubXcQHI=Kg# z93g*ldz%cQ6an^b$Ss-WR*xdmTDETgSW)_iX8Sx;4%Fw(&mQ`sRI3-ltpZypQg4q7Axr zk9}B&aP7D2;_rm`W4-;P*Z#d?{{JHM06I`JN5gHEV5TxZBkC+pu#C$5nVI-y>1owAuu*1wrfTq7HF zLe1kkTg`N`elwkXjT`|vULz-AAGZ4bz;$lT=@ww0^xRsTM^Fwz->03+#8sg0D{$OT z^*RURw)KMT^qy)F?s>FB#QQ2ZS~WlzO6YH}fc8yBaIpR&ga3X<&0zBc>xcUSdK9<< zhI4Q%wE(|1Iv~SukmO^>rsSc3AO7V2ed)}?Tp}0E#tKP%Ss%@3qJ<>9%+F=#^U;~u zLNtFWo6Y13z%g?&HkVqMgZ@M`o|ry07fmh9W}>s1Tr`$WM>F|oHkX-Ar4#vm`P|ID zbo%uC08pMx%oHdh2jcPAa4bF>3Xg=ti9jqi6Nm?f2Sx~e%wh8>I>fC2)`<#s4>8qtb!049^bhy=kqnUhUpbs$m3kLcqqrYjOPcr(NfqtCP zZ~AGY`tQq(er{itL>qBNf5AYfL56wWrZ0TgWxmGvud?6U@t!3{Cxe`#!OM((^k6Oh z+l($eQcJ(a=xcbNtvmjVwwuW>;Qgrn_z9y&4D`QYbf1C#E~C4S=(P~?0i&NYjQb&@ zCk^y}W%RZ2T0M6d{ryNSy@e)FGkM2A@8ooY9(WxLM>Dys1-hziFQcC`&_j&=%c&|W zZA2JdHu(7+q}xT}X1|x@QlAI9*>@oqfo}F)2=Omu3JKzmpIv}YU5I~f;gmla%O?r@ zkMu&0_;ZwRVGeAzHa~W9T@i;7S;-AUPGbTkkM9jP`)L)GSZu-crMtTx9-)|*q+=ENv(4h~nv2G{ithO1(O6JgFfJhY!T z2PXr(8Qed_o8iep-VBXYhX@|2iUp_E#U>{>??|9JhtT>QLZNZa5IRy-7MiHeEHqIa zICP}i42P@A!t2Vy6V=helcSbCsIerBU*kllzI=TA(a#yOm}lYrLrmvpIPF}NHyGr< 
zkLBI`y2Wvn#cLrK^hljJ%)EpF;`y8+7vg`pqG~els=H2Hr-v2Trt$>#u(QW%v#ZdGUzQ|Lgzo z@`3XF{-XL_S(nCB{oOVfL1x_%f;P&c4S4Xn#D8a<)yUQR6Yf9wjDj9x@E>Vh{vxYu z7uj{{4FCPN#=gP+M}dx0GFQtbkhsPg1`kRClz89H|#W7BL?juGOijhI*>7iJk!bk&Od=P9nrJiwvT!yXnTlPVNv0+v$=CgJ zk=fWQ<8Sy=)6>~lj=b*ACcYX|zT-a`NuG#>k0<7$Zy8O)lP9L9W0?g*<+XGq%9Rug zr8g%}hp7o}DR(*@do3~0FAwog=@3(M$RkOQ-%JU*<2dQsMWw9)oS2dRTy9%FlQUS!ZS`EBY ztATf`Fu;J=U-pPn6;>*M_^VnCyjQD%>s1)wK$Jb=W))T{fcR;x20p0Oz|X2MaOH3` zG0PvD*Kb#0p#p@zuhqaWYBlgrRT#K(Lp0IJI*)74V$|PBb89J)oj62vxSZ-EPOkW4W~}z!jpZ& zWjNF-3{}G6tJy+#%@%rAW1+gN%9Z_zNVbey!02&x$)6jxpoQA~eyx8Kl2PHwirz!3 znXg3_>V|E+W($XEwlJ|83-YRc%5>dZ>0Bn9jm<{V@l+;OUcaM8->Y4}V-{*FuHT>g zXELc1Y3P1tHa3?FPbW^rqG9|M=x;DF{Ue%`glweQAd*bX%sGB|j~rEQ*q(dGKN~p} zo=!$`;pbDC}+^CJj&N`~K%6Cu7r@6u^eF5Ai(i$yg?vNX;$rBs1iC z9x=hIn(A)u|80q6O6Cdc;t##B6}%NsO9HvU}x-B-SL{(K$C@a7j!Nh42( zQpK!E6*_XbGqWR0B%t?WGWq5YVH@+ZswLQhP%wfV8fmgnmIB4J9O` z*t71>*@pfv!x5gn`CMECV2_&7vWxn`|7ezzOjriLv(VH(cfeA1xDYXv{i4V7&Pn6 zls4*(l-BF@lj#6kxDYT;$+ELnG^lIKk@5N2teM|c`?$h=a_PsQu zH5BdST=AzZw$eVBu1;QCw%0YR(e3N+#C1mSn?~)hhG(7Jtwu7FF ztiMIPHhAl{@V@I|Le3FIy`(5E$7WA_(`9J86>g6wzOe-KoKY@$40?&ywHxg`-g#`h zv7|1k=hP*y>YXo;4xhK9Bkm;y%YM$U)K9^%fBAm>6pVfrV9Z;gPY2AwF-pkG#MxDF zI(?2N=o|V+j5fktHS+Q@uIVG*hGHFAFfsd2y-?KbNAsHfDY&18`%8Jmryg|3&HQaw zw?jkP4AcR0AvaGl*-0wvpqMU#Uwn62gEheM6cokRg5%kX<59XO-?%k?zQeoZCWE(H zVB@y}bpu@ov=u1D+XHLdp)R#x8<-2Wbpyq=tw6EuI-s~t1;y#RNNu~+1((a$(BzDN z9e7=FxfL74PT&j9E}uvDE~yUKBVNCz`+GGFHjv&Mp!9)uz#4p8CvR1Zy$#3y8|AUL zHFn3(Kj2;30R6YYST_RQfMW)_5vT{~CZL;vdVsbA-3)XK&~~8Ie{gOOs7q}iYlAtU ztqnlY)<&RcYZFkk3 zQJi=6vrjr&lzctR!K>Ke-R|Q1jwU;CG%0ZhlvOH&`6UQZool}`uejoaYrpxsgKM2h zwBcx0$eO0LWL|MnxeaB&*8yEqAIc+D!Ow?O@6s0UJM)9{FV7EoNinEySy0h;hrGu6 z!My4E0j28)l&&98-i~4cJjDVi#R4eB0w~1-D8&LO#R4eB0_c7H03MKr9|Wl%K-ePQ zU@%U70vMct0;qU-)f@JqWhkIw{ule5cFH<`cWe9p(~ zdxiR5T0af-QD2}gl<%wCpYN|5&kxic%m?ZY0slDg6TlzIE3h9mdkFg75BE6S2jM;h z_v3I+zMZn3h z(?;x?j%8+N1Lv8Yjhtt8HgO*IAlmhC9zQw&elzFsB|Y%%oL^(`TY%rAD4N#TSa9xn 
zQPX;b`CwU`5Ax!CkY8y&{e~SnA1sUWL0+5>^5T4u7w3b#I3MK2`5?d2d;(Ah^QP;A zWpO^pi}OK#rTGjPcIbStEY1gcaX!e4^Fdym5Ax!CkQe8J{0j5gLSBVf`pPoK^UdD3 z=TAL(p$I3CB0hVDJl+?)?3vxxJW8M0gxvjTJuG#|pblcEyb>~~OY6J)Q3oMv$)G+@ z!BO09KUX|M47)1%`QNIxLWk(*DBD(F=#c$gb5q;yl0nPI@1GJI$4BuPiDLTLIkU@K}y>ByXFRU5L zyB=V2aqZBqxOT{kYlpnJcE}5BCysrOwr7FwwGioiyK|eP_>Xm!Sa){<*1{e{tak4J ztfak2I@!7!)Pe1KcCmI6*8V+AE?`ZzGe0n>3+Lb6XMr{DE!aQk_t<9l5VS#=;Qt(J zEyH}tpgwHhvzN7(u=YR9=3JS$DVmY$g`h7cA*%6^UQto$0~f?u;3qOcgUvR~sq6|km$jWP?Y zSqxEG3@Q6HwwJJ`evNVwYve_&kr%N>Uc?&tRbc&hR`FVjA=Iy}V~9Is#E`OIV>=0J z8n000Ym`}F&0dawvMnRjuPG^z#Y@d7DVXOk6u8}};nZH~3onZxRg zV){1Pu<~t`i@uG#=-bGPzKy)-+sLoNw`V1+<+F&@nZxRAV*0fdN8~ey>DN+RS<$a? zugX|koH?w{D5hVd4GHVd<1DgH!dgCySe-ejUrT2b)32qoh^@-qnLh;HYBV+kF&_zkk7g2cSno^FhBeqoW+oeGl$jL#Pn{>rd7MR_k+7D}A}odw@yua$HZlF$I)<2M4%4qKVu+<*uN*_n zd;?-?bw)A$+9HO09%m67V2x)H%n$FeFBC6BzJJ8)EdCsF#Tw)u`TH2m?f*8k#ayPY zbh*bN_kwwJ%om@--0;pWnjdw;dCc3@6)$vk`3kgckhaAfyJIipQ1N{T;;PXtAIJjyCC3w9@=oc@tq0e-_=*0K{wE@ zdcYgstH3)WH^m=O15kwZ7tQ9Hax)o-br8te{2ZYdl7g`m81%#e429F7Q7-cA1b zR(<13w0_$GqkfyDxBXE=zFpAUCFuDcGwOYk-i{{>`3^yEx1iT`*r@N4^maaJ$ae~Q z4>CPM{`iCS-LRAjP<`BpdctzDWzeQ9^epbvg(-#O2H=g`NFZecAo;CD&cp5}Gyd7ZGod7X>b345K_(f6Ym zSA>0!br^?m@3#o+?}GKmc>7uJ{rj!?|HlLx3u=xD=$B7{FZ_i31-#q%@$wgJcb5sq z1&9;eA0CH#4eEblQIC3D&YwwW^IE1Rs|$|m0<2SJa=$T4`AX?}O$K6BfpI^Z09=618sp*;k3ziMA5?i~Hj z0sH-xKIfp{=G`!NdZwC(GmmlI*<{cr#(NmWU-|Jm!vs!)CGawzUjKGiPhla*>fu~K zuV=Uc#&a@}n1jDk{(=gByR1%3jH~?tKm1d>dy}cTnOG*Aj^yI_kzY8Q3g_bR5nv`g zn+;Dz=EB(%>2xZSgBDZABQuG)8K{qiqp`^oGvUPCbSgZZ%7i1yWH^-#r!%SPL^77` z&1R;0lgX2_dqDD)*i?=(Y9Jbo3`C;SL&GD(!?8dlG8Kph2Kq(@ree{7k)ge@KKR4t zsmZBbeAM?+OeYdZc$|5#vd+ciZ9SE81Ku^{k|f{8+TW4ndzk!Nl03-d*ChEEli!x) zN0>a>OAF?YFEe?Xe)wgINtDSiNpiZ#FyFvHxhQls8fbf`pkQZ{u>w4tz!i*BbKP6P0#$Gx;@1KE&ic3C|Fd zACu%iLb+WbN1v^<{{+abzB4@wa;xu5iGMDYixGeH^c;LEOZ+o)C;ah9Hcr@2l;<+U zpGid{xd`#+VyAMnb!0X%1>#sTlY%WCjlu>e{;AX~epF2GyW3o35@;N>jgmo0GBO!U zh6912gJv-}WEMvU&Eozsvp70ZE{>bU{Xt&r8`^Ic_nK;bWn=w=rgU(`w9r2^%GHKP z1H2dfVs#mKNZ$Y7vsY|NbT$XL1KK)E<# 
z8Vi<<1;@`5;!Y-oeD5{EiDZCiv|9^%d zBs8VX%ScfdpLtx3KL2|_VEuf z;vH)YMCzVk7(|)$xFcNu`(;&H;`*F^38W@=LfbUvh2RRXfi*)}g`1C``wtG6j~$BU db$P#04~On%9d@~mc=JUwq;wQDT<?R-8n3oFMIz&MQNSl&F{z zIg)nl&Xb>5OEJd7Bu7GT3(@O0k!y!Zazx%WKY_ndQwMuyEMQ@}#>qlzpM6T3zdVAq#} z`o%~!%KSuuziWw&)PNX$1G;(z&9h*fD$EYWyiF!5A=(z5<&@ZU9{r#TglyG$vZs_P zZ931X=!bn(2AKZKr8KFttS9mOQG8_+u$Ht=R^!HNHa3u6hYBCcAO(c>6Sty+hk43Le#1o_8_-n4|@Kp5W z;Onl$_;@;+A+Nd8lV6D{Z@Nx}<0qq`6O+@Cw{_Fd=*jW%XlhP(c{Lf1a3{sU>5b7d zAsT{5%A83?U!Cms$X)y{?P6j&e&)74Jjfff8#H#u9y$!1KIn2i02oZ(r zCQHQ;l!_zR%7}0^os3UrPVh-_-DEE#NGTMJ6-RJX9HFU<2;ZAbhY}|?-Dp!Uk!|7$*+22TZ{d>Z|ZPokt=dOuV z;$#wRs1V_;hGI z9?pcGNu*A&;8X_-JZ?^Y+#mYV(Vj4DVWzzP(eW#6LbzNmpDaBCzvSjC$Qne&Bj0d@4GgN&t|EM}Y6_o{FZ@lZojC zzU|C*gI5wXSYHLZ#C)Hfncy~h1MG|k*gk!L$88U zzEFU7I5p<{5;U#>{S;^(D7|sD=KN*g13>dYHJ}<$Qfpn-J|2YG8v{ut2bh*^EkY*7mq%gzE|RWO@%N(iO% zq(;rbwV8Ztv5I_qv8viVb1wgU?gm*{yg^=o^Eo)@;k>$7Am3QTOe>I!jDC&LXBmB+ z(uP6<#h|`WPibADj?(HvHKjWWJ17OaXYox7*ij00l!6_lU`OfJoKthoJ1=f_4lE38 z8qfx^126PwH90dmpZjo~DL(+iEs$3i%~dtE1@rnD;0gtEBhqsP^QM`?IYOGZI2W8S zCf}?zw+;MkWW!D3G{IZ9xesjj6LOv?>Lo?7S++Q0o8JN3W;h*=*rq(t^ZIq(q0>uL zw>9S6+N9e$oU>@ZDX%W5=hX$L>YUAz<~C<@b4(>!!?rH0)>c+;cZ%swEupT7Vm%M}^46jT_XhiyRg^Y6 z_HRG-PuWTNIyH8o$+^%#yf^JIq1OX#0J;I_dZ3L!HvruTv=J!1FAm6sCUwCMGCPbF zeH(zH?|Pu56p=Id3zBTp7X@jRsdWfjfn%=+6?`~oVnzn z_eOJWR~I&;9QuHI2T0d^IrZ^^jE)!R{j)W>PROMmjpX(M%~A}nF5*48pd7c+Tt9vs<{Hc&$Y~X> zHS<%DhodvZ{`iv1U8g12J)bL>5u-xET?hBfY@NY%*1g8M(^54)bIBTMgsq)WsSM|+!tNLepss?5}RRQ3KfDZyc4E$l>9|C>^_@gs6$Z^f=hko?I*$?Lc zoB=q8;0(e!4Ci4uAA)lP&Z9G$>#RoXk1pAX333OnA+L~c;5-&;;Ma2=i#PBaI1hP* zd?V-a1wHT^f$y+bG_ATiThlSG5&Q9_&F(Cv4sTZLR1{6CtIJwDpVPE`LLcy2>;v** zACMROfV|iT;v+v^kEx$1=g6C7jbRa z;{5S!@{#9rQ1x;>4(GE@RwEkg2WgEU zbUt^Q=rNnfFaFkTN;Qf17-gR7RFizKH3yC5$m_HT?Ywo&P8d7Nef3N(V4!WK{%T&Q zZ7R0ikk#7}utB+JBa;glX-(9>J+IRawDWEOyA~(C7tU5|u0!jXqj?m_YN+>Z&o&|N z2xg19)`so2EKt8ZV26I4TIU?fJv$bfc-|rJ-M!F+)YrMt6d)}>ZfUjTQdOn+`MUr= z%qf-|IR3sK7(dE{93fq_|0JMO3)=PXXLb^P1NSqzfFJ3m{z^cnHXNUS5XLX%H_H1C 
zF}Z*r>7oAifKD6HuK&SR@Pm1X`hg=1KbVhvt|A`(M~(68`$8#x{ZH!n4gCHxeytLI z0n9&~4+4Hen13h}@zZnM!E&AEUl8*T?Iiq$G5=65;-}9EmCX&Be*w%tw3G602=fo+ zB7XWDYG-qa=3j8N{DU~$_4z`{I#?BOY_gW zTK@6*%kxj3zpLb*N6J4h%Rirxe_07XKjxo2e=+}z=dXj!Uz&gYn15&|;WvQ!hjIZw zHh)z%e`)^tG5^p`!mkhW59I=WZ2q>h`AhS!f3^JM^OxtJJbzcozdkAd`dR)B2>Iuc z@C#u6;am~tFXkW0R?c6-YZc~c5c3c1B>aXk|4=UCC)GBbUjfWNw3F}~!u&(Ih@Vso z(LcCa{_*+C^G}|?tK{F1lz%~%f5Srl{XNuQ=dJ8*Yp_wRHLA&op-~N%Yc1Lt)nvrL zs0JI=TBDkb{wFW_=&Z6wHhqqhqV~(jB4hz{tu?C2h>KATX3wF; zT8nl@H5tcmRD;=bYe}s&s>$eoyBaLwC)Q%b#i$00_+c$ZJEI!>Pw<Pu5Na{v;xh^NEbH;K5%$}g zkj0NTH9^1zpeoQ-prFP5O#Ho1Nclcjk4v3)^wPenJh55BE_U6@A|s<*HN3{bvF8N}(Uk(D!v!uur9VKe05~ z;Ze(b({cd%i~M$pZW~ui^Q%vEKcc z12BZ2nX~lXz5@4_-`A{gUoU_SzpuE*3;U1iD^`yk=+0`muec8k``&hXk8z(-y2t(G z@QtSZX4qCO+-%xhV}{4no65$2CMe(jJGy*3$l)=S%G=3rZdTX5K=sY{>-uKN&v`(X zI|aYpf}c95>s86G<)AKa5&ZTDeyxXeeXHbm-vhe*KEZFV;Mew`u5Xk4c08=hcL;tR zg5S;~x_+nRx9bsIzDw}yWPXJF@kg85A@vlX_|CBbXvc~<)6VA1ZcU+cU{7ABd)b`n zU~{JPuIIo7J_ineVrdt0N&|kk#P5;#y%OIc@tp$CaFl`B|<>y;o#zmus(8~jq8N9lIyCu zPFOclhjjqg&Oe0T4Sel9ZTg>CJMWZ>|G%adZvJ0dQ*l0?g}nPI`3rdO@sq_q)2&5< z>ng0XJTEh#*FgVM13mh2Ie#|5m=i`ZVXfvdLq;)~ezlmmrmW~EtoJikMKlc+53MF`3>dCSOy=!H=&g zWAJ{m_@3T=cFERcfqliZOZBQw*S1--FM-YHy#^3_hJF`-?S7umSJ-a-9*c(Uey-11 z(C*p}z?}Y8nT6j7`k^>xR|EcZ!;)&^rXeyKpXJYtqUMQUiWn%F0UMe}24vmGUL+O*r zWFnOTi?I{oiOJ~+&__d&=;+Cb(B$-ZA~c>zg~IW8D3J~&Q;G4(cr?8)of_L0kDr?A z1j*CUu?%HYcO(+-4oAj&e7!zj)Ey3wxg+lGuHNpkXr#NhXMeN{{z3BC=-6)F>y;F@ zzX|*+>a%-Gbv7nAv#-2u1NNNX8)cfKgi^tO7f#j?$}ov z|4AnQP?ATO+(JL(TK1tg8OD3LYgvX@U!iO*IS${-h{qh0e0 ztPthPrSTsIxzTr^_zjZLcc8>IoybIqD{^KUK5ZqgiRqKBSU4Rc>@UhQDdI{cBH>J! zxH8ex8EPG#nj8ahG@eSp6put;f)m$RVhTS7Cit0bCOir>2HyIWPDwmG8jXkC?w;Y| zrQcV)99+IU=vkKTU%p&+8#>6PUEZQwS8q>I>hCIE9`f_c?qDy!>>4h*^$Zma{QHC4 zz;~$F2VZxw4?ee>8}tkh@yq?oQs2R1!@lKC_WFxW^m>YJy`Ewpdi}-L{9Q$)iVdxn6HPT&h;<&hf@q2IV7#somclnRZV_89e{(MpVj(n@` zQ~dw5=&WwcuYvZ!iM}`nJoviA|Gz3pU0i%V;rQ8^d;opm7Gx^@~BaNznS9uiBA zfD&a=86@2Q2ck2xMG=qlMY!hJp`0J>xW5XAn{})nu2ndB|9SpkcX{98vUof(q+jy! 
Wa|Lgg^D($E@_(QC-;^9Ami#yC-R3g@ literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..9be88b91c49ed02dc9778e130f5594c239ce22e6 GIT binary patch literal 20544 zcmeHPeQaCTb-$!2iJ~Q&vSpi=W%0?8ZP}J6N}?nxa}-LJ9e>D<<0hFCWMwFk5*1S- zN79aydGZr$IYt&Fo>FwhS|DGxleS5l0co49naB7q|WRa-D|a20ye^4732o7@N0wszs`-a zi_uEt`H3C=UrB076|hk^z?yHt95*J=&gGCT`sA!42y9@aK;mRH>cJ2QX=F6nQH+6h zMhlenaIAs7ynUh=6O1Ie8h*k1vjX{9^n|{c-_gHMhv~X}#{R^b-v@fw-*R8r7hO*r zFuoN0cr9ZK6l-4%><^($0~|Uua?gIWNQGq#|i@$&;FTGGhOp=VT~$A`(14H4}b=i3Z0`OiV?TUDjXe>h zCWNK*6N$(xQ$5}45Who*7@vtf@u?%+Ejn|WbavYjx=fTl?-34(5o#u#-FAd+CQ6_8 z2#3W8y(XRAc7%Qtr6otmcgdwtED;TfZUfAMO>7C=T(`3z@3WCI_4k#jKU#wR#j!}* zSb!rX7?2m>{xbECm8t(o3Hq<4(#ffCWW+0W!J{P@kryFcrhc?ceaM-ZBA2y`#UgZx zMM#ujL|%k+nfj;7)IVKLGV?1&djuu`-T7t1y9TDb;ZFTsFVh{BF|vji(e3-MOD2Hq~$ zz&j-v;0rNa*b%uBtQ0N8UzTg&XXP5WQi1`o5QQCatpqDY3-R-E4ZL5jfnSth;Nqe1 z)U>#5Ub#_%g`!3HRk;R!U9N$DEWyCV!}210T!Mw7Mflfp4SZ6rf&VJOz{P{oB3!kQ z5=_XeaMfZf+k(Ap3$-O!xSUGFrqahnP+YY*OE6Nr6b)rtaFuPLsRRq(ok|7cC(^;O zuD-%@XeAgbUXIqXEwq(wp?x_PN;_4#I6M_f71qtmdR*S&&sB@xMD2DT>tBPzsNh&p z?}6pa*W?uHs%4~X3;W8pFuEKIs@Fc_aorn-(n@4~TYQGd_F9mQh#o+mY#NCwB_ zCuY)#S3RjGrpMy35Bf?Nf#-M_&bbU@SFPj+p2tHcBNNFuEE~=~AkJ}5Mv|$i_{@Sh zWM)UfMgL>d>Iygw7U$`_wHL4+Pr$`_Jm;glc;ivLg`JyQkaytuk1YEyWqw%!*Z3Ak z(Qs+JP>z%^>r!-JrHp=Wy`Up0ueb!AI8#we&cdX`Iln6Nn?~En4;L+Y%ej0W zo{dylb2nfDI^f5EIY8>Z~~@EVmA*;j-6&9AID-*g@J+l{*6XG_h(=TWwn2X4{&Fwa#0u3<;rlj#Oz` zxVDmSELM;|TCAw_&YsCVlf6b37O#zG_d=16Qd?m##`7IPd?kRlI0&)~Vjv~lW1UZV0S-0*!=YDRzdvIZJ-Jm|0 z8GN=^ugY4<+3e3(S#pCg-8^|^(OOYeowu%@1uCDnHXxqKTi4Cz&k)kQ!M)&yIrYss zvKv9KmaMr>+!lE1HveAjU4)z^cI|@QUTfRnimrbXWSimbaz)qW0MD}PoQvTF+SV9# zH#RX@mwOK7*X6VY?X0%o*4%R$(%kB9ZjNdsW7^mGrTWUSK3@DRePvi*^Dy>i=+_2w zu?-RO05I2^6E70po`1Flsg%vmQdETV7kc2{L9$h@(3|B=sRb?dQN-TE-RAA$E* zXK|gXL5EzMy;0k4(-G@9t^}-td68=;dD}_ecJcK7O6g(hvm2|Opv6?N+Y 
zQFk>U>aGDq-3CC^T?>f*&Db5S&(XeXwfWlG)~Xeb=+}VG+iRWYE%0m-=!~Pi)s=TI zXpp9Lw@1%=I(6LzJE_x4aZ@As2Vx>m-YA;edYs#D73Q|S#uGii$-Ur${@26UHvqZ- z8v!=}ZUk%uYy#W}xCyWcuo-X@Al-L3*G<~Odf=^xIiM{UAlljhh_)I5(bh&lwABQN zwl)Ext!6;q+=}cT@b|`Z+H<~I`>gJJVa|4s9pXdJ=IwUJ&HR~rZL93Fl`sdl-3rfy z&%I|`VI{T|_NWc6HOhneO%b9wRz5aouZ{Xve&hFjD;+VkVXL!~>J=-=oZUhB7UTh4 z0a#rbm_^heHhMMp!bbPobH2G3=6c;E>(@5UYq;Ng-OQgbZ|YA#>Q6xGPeAHVK7@B88X;H>UBtrO>cH=M)*@dDQt2zsFF1r2M4^lCw4Ap`UpLE9MJ zAZYwj0<^CMy4_~e^~%ajRr@(z?_&L|CssX=*SsH>pm{$V1kL+t6g2N=qo8>|O@hWR z3qZe{1YOPOW}rLlc3rQj$v8Tm*7Z(lzIZLq7ioFENXzp@TAnY`OU<_%+F(3%zIZLq z7ioFENXzp@TAnY`@_dn&=Zo}G^Yua-jDz|YujTn7EzcKedA>-?^F?~8`Sw7Ym@i(- z^F>;oFVgaSk(TF+v^-y=<@qAL#C$iBmm#;mw21ec4elS$oxb;(EL2iiym$7x+)ufA z?a@#-L~9R1Zhp89wn~7ZjaVsd4=}7HHcC4K3>)|JixiYrwx2$KgvxN1d zkzf9e*OF*O)IKK=m1 zU2Qm~mY12lv8yS8HA`Mbb(7}q>zaP9Bx%{1kaC&dowqH9M23~ea4YkN^7r8&Gc7eiPqeZs2i=GJ}Ue26<9^3g78Z^S#Z` zhimS?57r#z72nd+w~CVE4~~29nP11hjwM(A-xRW9aDTq;|_C_9r9)0vD~2X98rinlrxJv68h6Mu^Bv1a zex8>KafkAX?`Yg1U-2D{JLD_Aqj86P*>_0GzC(H$zWav?lem*=5hL!DnuO;}!gHq0 zh&z;1d`IIB`I7H=&b0HKS%^E7Q+!9`4*8Prc+Pb4oVj$|VIGovN8|3(bEcQ)%tG9u zyy80=cgR|a(YQmNS=_N)>CZ7N#2v~jzN2x6 ze8qP(?vStej>aAGW#1t!`wr=4_-=*bJ9@vv{3PW}dcQ;7tvQp3I?0GTlv8|1;|}?< z@07aAxZk0i;yW65$d`Sm)L~2B?=TNZzN2x6JTu=ZwVZLkLwUt_H13eE_>RUM@)h6F zxI@0|JEUdbA-xRWeH-e`$bV-(wbnGNDa~q6^q*O+sn(PzXI6XS`kU37W;LZ*?TPxg zt2ISkqSlnCYgU7z|Gvz%X1nq^L9I2-YD%-((}+DKZ>lvV%9+)kxPE4}rddsCR(qoU z?P^WoJGG`nU9%e0h&{|_X0eBSv)DtvS?nQSjyV_(ug~hQ+!9`4*8Pr%xX%r+7s8`H104DDZb-5)2Jy?*Q^FL;tp*nzWXxNnhC{s zYOTq0CXs7Ov)a>$JM)}roYfg|XOc5%+?nT0<7^Mt*{t?NzG>VooimN|LDV&?L5;XG z$(dhPc8=lGFb6R*dlNlUrCHyRvki`$y zX<)D(5V&MBU>kh*0M2spOqY<@`*0Sk1sJyI3^(T(-a+bVduxE<7Mv9r6Z>(&d2toY51+p_*xB=HuY{VS+b8|u7&F$hz5 z*P5ZeeHnZ!{F;&cdLCqiU-8*6JV(@Cb__ZJw^o8*@%b@4FLzQO%?Nv}iF zYu(S)8Q4;GB9u_!5c|GwnQPI&?dY0Z)!$CyyB~kD0E| z90TV?4D9>J)-LTS9q0~)_9%3xLVFduOQQK6ZISk9Gwe|{C%yDJ=cN9O`?>zFY3E#; zBRi*S^KPwQ$LCDHsky49-wM6^k{Otjr4+N8WD+WJJB zl;cEOr)ZP%ooH(lZBp(NZ7rfr%7dZ}zc)kYC*?%Zwnnr``BAjhiZ&@%indD8Cgn|R 
z!}|c{&fkO28X|X|vi$Gl&fD$9|IexU>;IQ>D#qh!*mplAe+JLCf3kR|<>n&6yb5_% z?8`K?>(KsFlXlb-d{JAV%}FzxkgJ8wpqWkHzLZVODYuLh^1ZNmz|1CZU&+HM7avm$HdD49DVvy6Zm|jXE@5+pnN8kqW>e&p3D6TcWgNca zV|>5QdHP1JW`oanrTH&STTD-!HVzf__(e zVa@4hl{xr~z+NA(XSjmXE{=EnC)(Yn1J_@}jX#^N)Rkz_CtN=NZa{lQc`n2y4i{*#I6RB${r6HJ{* zB;v_5NQ@s3O-{{BLVF|_j*Oj{3{K5V#Df#@WH1zq1>>n;A{n2UibYbLspNQPEOv6b z12~UG#?zG0yy0-DClsFO?d$LBi+Dq!ac|h$)79TI9trpK_YOq5;18LMkBx5^qrMgK zc#Xh8?CG7wZMB^5f?8I7vE{8M-&FXUxxBqYmL`Oe)#Rcg@8|OGDE!@=|FOb9%=vG1 z7TbS_^M6La=v`t!hcW5KUggP5$Eqx`u#QMw<-L8 z;rwGqitXIw{9iFg7>zY_167l23crE#1BxB^+Bm$b$%Uha33{=E^G_@MUe5ozvYr9X zzozhignX-=1SX2@9|XSH?@G=9-|TlK#4{65M~El<#0>m4hIl4tPI#iBRFv@lk)KWy zPcj}3r9;G%j+{zU>Cp7lIIttJWE?^~9EJcVp7HoJ{)mX+?`+bcF~BHjvnz%q78;Ah zf?jX$UgOf=XIu^yE_ZhqxC4dDg4*zI!R_iBF)oJACJ0)Ah=ko9{FV}7qOvSEq4+Zd#O_mEH<7%;jR=rPm=dVB=|qqm`MgFDn^ z7#kYu6|zIazEXafQ;H7$nkP8&(BYv&`&rT|-f+vLaNFYJFg!-4FLA+9{JkvEEbdQ& zMqRw;2{mya6u6HI;(ZYNvU6J8-c|wq@A-UO=!^THz*a6V+J!!6)1PIbZP707n*udO zUyNTMoFmfXexWb!(*k2$UjWWQer*E+c|xBK2nCeiiy;eN0s4R8KfJA=zNjON-=8%y z8RPG`2^D1k`1>@}#YYzS68T5`owo#I(?_CsDb~-)`FPtwp3r}j=dU;SGJzvCT#$p( zJ;5}NJa4fW5&Az+_(ESG-ZqT2lVl&ghY?;rusR?$c#H9i{qt4eiLt|_@w&k8Q)2$2 VLG&x=C|sE76MkRJDhdio{s*8X`||(* literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..f4eb170061009c801e0b05ae31833dda34f7ba99 GIT binary patch literal 20496 zcmeG^e{7rAagP)wQ?g`Bwrtb3Vt!I&M|LEN`c0*dOv$pWD72!eiWX;SXiB6+)s)DR zv||UZ{E4v?FAEY!4Z7|Jmeg^atWBD%Yu9WoM0P094zLG1D~1GX01e2nVpxG-TZdpj z!ri?e)Z^19TPCyoG4wM&-@AKv_uhB+?%wfzq|P7l^qEa2w}r5)fGiUeZzFK?_9YK% zcnXo`B3Af+6DcJ{Kt@?N>s|#@T$zAY&WB_%CMRV9eJ4W&B;^c8IhX<=yBJOmWW!JH zWw?N{9M1KVBfLM54I>OC))L;}g(<-tmOZ9|k!b?@liqi`o+> zjF$o}+Zj0n8A>(){M9x>G=Sa5dXEl0LP+8}zG;6j=5w43L}vn_*o)7-=I}))W4|Dt zh3`0gap9*D)h6%F|O-Y26|@c{Jr{ZXKKV-eqY;Ob;3Jmw3%=9uwK2fo?+ 
zx+6R>5evl0YmV5|QvvI@9jASvQvvVEsTu!UjMY1KYGNV~oo6DiMtpuDWHk_aW9+P# zst}suXCr}Er&^lTDgFtaVt6KW_Leg|ECwq#80@w)v>6C})-&uDGt>+QyX_3?41_-G z8TN=7It>Q9?F?N8LTk>Duaeh%p-9jthIKOy7O^C7aotXXyv};_l<&_|-jjp;m9aoP zV}PL?6vze`$y5H(JmnwHLH>#0qhUi&Y5vd%sWEtWo`6~EXz6!48pgpZS|D-w@-V}WU3Bp8kcR{S@_#=cYkJ!&AfrvLuJ zF&Pb?ioo!b(}9_|cVg;H!0*M6K!1Y=(|=A6CE+wO!{7@|P0m=pzlx3`*G(thbxixt zcqc->xcA9$^dwhK`yj#N;p7Jgy^jsH`QQjM+UgpcXlkDfOiz0!A}!5`c;hqP_SQ4) zt=_R%5Dq9q0XW~p?C2d21$;9{e&~oqr%wCgKwdRt@;yg1@D(^8%^LV?u(g;>-*&_% zBT?^o_|!~1^135-c6uxv`ruGLQ*oU1!^u}h>vc1E+wr9DbYLPH21BAf+~Q>SbRZg= z3ePNxV`qLEyc!_GmI62>7U%4or3+vMoQ%8qIlY_Hkzd^Gi2f2GI-~ua=j|sRc{;jO z06ch$BkEO?$j1mIA7QW{%AQwrFXKWnUU>l{T9hB!t2R zQlurJZ6;rbwe)*S1%=Mp^Qq^OH^}1B4e~tP&%(U`_tm8|`Pve$v^05v!{6ob84h2k zup(VSZBUjjqp&z#OkrWVkixy`y%Yl9Gx(+j_$UNE3W1M8;G=L?(yrSV>@V!FcP(~p z@6x*xUC+1aMM*Qcko@^(Q>qIdH%(q$G8YtWNSn9J0+vpjD-oVgo43!V&l9p|r+v{5 z&(uCwlB@!`QnK|Xv76wn+x&Z__Y-n~ShY)5YpG?YEx02EynEob*@D}#eSx)8HU=-z zzFk54uBxEDY7gVL+2>GZdrDi>E@+E(%|4ePTCH8vg4HBpINrsz#!IkqURm#W2{zt5 zJi{Is+XByI=_cf5QnD{mQc`Q#s0DEzXtO==>^gaQ34MIXUX&~#^BEidXF8G8%_nAc z^BCNIxF=@OUd13oZp<#1IxRZFG7dL!xEY`oo{`)*&3R69p3}>W=kzl2R8b!+z&d(! 
zNe7?dd=pk{Z3WJ^6X$E)NAcpC;KeHY;ug|+vjPHrE5I!PD*!0iCH0o(y_ zJ3t%29f+gPY_Mjkw8aXbRe&u}*A{@NYb!w1RS6JvZ3Bq9wgW_6I{>0C8$eta32RC1 z3v_O!+I(qgZPCV(;M0Jot)=BwNW6$AN}6kJY5SrE$w;?5^t7W<*ELvyjZO;h+68lh zSW1((mM~#mAg#4&@B=J=vKz+Qh0g=?zGS1m)a`d^i@T5xbHM)H09)raCcD62yBD+< zT1&07dh5$`mZMf!=Xx@2wU&IEKL3zqvvsx*>}iLI6_SPbEE~)gw8ZFZ7$X}jk&T(fObDx z0PmkGN;bh-Y|}}y5nzJa^y(7&=%O`JO5-IGftZCDgqYBvt()Vpj!(~$@{wiNeZ_iY z^K;3x8EuwM-&YJiG?&a`zP@i`$v#WLQ?r*!&eV$jFG&4gp#8JZf3EgmfqVA;f}Yu~ zg5KFf1%0yz3;F>c0NexkAmBrQ4+A~|_$c6y0{$4_j|2Y1Y&pb=Zg#`Gdf@JbyASSu zxCh|&z&!}}5ZuFXkH9?&_oHw>2KVD|KQXI2&grE5;pGZqg0%*169g3CTLg{;0pME& zjzt3Cl>&!#hxj&u;}-z{-!5=`aSr$nfp1{A4e)x4Mb`@p6Gin4Iw^mAd6zpu;qKmq z-e9%rdU0{0q~TdzZ^`GgwM26fbu(SV@W~ZgVqOWlLC+$ITa#PZ2hTM*iV% z-KIp9yrz)WQ<$hy*OsnPU62MStVO=wV$LVo3h8}ioG#f+uc2~vDF*9naqNmkHjZRV zr1xy&bjhZAEtPks7;Htp-kqFJvNh8C?&5UG=K4M=?@BRvunxzszFd{Awo~8RYf6&! zdi^|I*Fy;%`+N3Yu0nkGNFuZLk>0zX$)&&-DA%Aj%p<+8;c}ITgL>58^xZ0iJ#7s3 zb~4y^|97k0r1}TdHJ0R{CEFkU-QW*g(|m2BJqM0}J&`7@QPNGHhqxKEAfM*}&Zqce z@EE5{{vd~`yw%NMDcY_7VXzyHqxfUs5l)xj1e~_b8 z-sWbo3hmeLV|K;o%<>218SEbjaDPC)5xEHIo+&1$`eSf9+aLX3VfG(5#h~XbgM(jP z@y9A4r$|!D6_M$1;tNA&d{? 
zQ~WWE@qu*NA1vmbJoagPj9`2qpW=^Ej1Q#C{$M%6#d8IXkD;~V19B?beOSZ?ez{5g&a@e7Jdh^ziuTL8@#z+5J@gee+hz~Vitq~uCN__P5_!!{v;o<^Z&GPMWh{~?SI*LYo@ZSVpy!QO$+&Mm3gNOCg_8&4oU>RgER@8FHYK2A=g?cXH;X!{=ixb{jr`J zt3mOHT4Nd2Qbsiw+S#baQfn#XGpf1JPDVABQ7vUubD_LZjiuI7$Y)e@pvut98^^e+_3`@@I6l#^U)ZtCkutsJYP2Mm3gNOCg_8&4qR{ zs*;0*XJ>8jI(v ztXgW=pyon58`W6qIS%p})m&&NqZ&&+_d!0RnhWVhHI{l#gnULd7uxk!HJ0p;VG$ot zYoXl8ZQ=uKEt;>^QDd3FA6R4Ix%lrF%qLTDp59?ENluyO;e54ZR=kJ8cNi}MFP;;p ztj1U@tcQ1$@SX+F@dydMUquM)WfWHH4A!I=yqi=|daawm zy*h*YQViZhYG{9*o5B4$g9lO!-b-3&f4!T*2Ax4iior&lhYijVi@*-}{-@H)-kTv^ zvs;t@m4#4=DJ(3SO_^4GQj1@J0y-TcEx0+@Z3}`Bz!I{bZAX#WLdZ?)wtpo^ zuRhG^)j)@jPn7N?zq(mi{5K6rJMakWKcL9n`&*2DuO!zd$<-fc z{q>4m!*4TsgCy53$vM8n`W=c~qnFVeCAkhRN5~(%ztah!Vg=X?i=hHwE8sf;wy(+s zPM!;zbSsVB<`jc1JQuX`T+pWPr2XwF20PXr%NIo~_kU<{N^4sOyh*{E6}&~kTNS)b z!P^zQL&AA1{TX~F7P0h>8Q+0r=jfk)#yPqpVSl0HX>Gx#l_ckMZQibR==hH1|9kGP z?yw|hz?NzbH{rVm=$jOMQa%@bCq$o=+eP1T(I@43(Kjsmq?|AM9u$32{uh0{qEFfj zMBn|QPudqmU%Tj&_6X63?}q8KOZ$cBs}p_F-XZ#`MW3{fh`wE-Puf#N-&WBl?JuIQ zRP;%Ejp!>BebT;zeYg+f-t$lJc}&F1N2dRqz2|l@@|!&=ee*Ya68X&cB+NrsVg3J@ z{3X2i{^8Pnrcaj$?n|&wiTL4g1K=!_fbu#2mb8p0MW%on9lIynij7n1fg8gne7+95B+!``6NmIe3*$*!zXf zHY1(9e=VJugIDQ<`a$T_jCAt;wRB<*UZoRi5TSFUkxt%kq*LVJ1&|Xt_-XjwPUib? z<>!`5H4D67Ik#L^%iyLu1~=DR^fSQwdFM`29;e^G!*M^)x*o?ZYqsb(?&n$G|p#FJ{xPa=P=uZ2F%H{-fFSA8~rBFPr{TPJdIOzsKnx!{57+uMau>xdE9< z$S*nlj6(l6PCvm6o)P+#)6aP_B-$vZ3kd9~&?`B;U(o|!_l9c&(H_p$bAZ!J6?z+| zUsLSq=JfKBY&}0iy4gxT31rhB1iI1hJ}v^?=yxB)F%ymlh{Jz&27byy9FsGr96?_! zNcjIqk4K3k8ut6*KH`W6&crFVZ+dDR$bnEa3?c6KLx2;&7*z#vGo z7K0?@8w-TIPG?(Rrs?X)G`n4y=HZoQcTa}e)0Al*c8X?GTZXsEmEmpb%ru*ux&?2m zt4%bUn-7a-d+!m^?C$6kO;>A%p<{*G(UKXsqd7A~cVC9r)!dTN(3H{J*_4^2t26WP zuFlNNI-8n>Sm(;acOK3t>+V=-x?1!4!AxGl_-h+)@A1LzV-K)2mc0aX!f-j`S1rs! 
zS~NHmfB#B!i+xMrD2scc5EJ{Ofc=~i&NHD-TlpQ2*!DI9?LX4#u#gw~qky$sUi4cz zE&zT<7Ilk$v7ZX4aYoTE<}aWf`thqSqW>j6F#*N?&DGFn5N(9_z>Uu(s+Cgpo7#iLhTer*{EF znvf4pu#O{!BIMtenK>_#gq~4o3)>-GT(+aE<>V7915|@s n%wMdZCx9mA4$aKf&-Wei{GvmQEASvRjO5?t@;4O$1ttFrGpv7g literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..36eb76a83c293f4b9b709e78c7f4ac60d68b1c3f GIT binary patch literal 22576 zcmeHPeQ+C9l7EtATUPAEPMpMXoM@7Af)i|6mcIi8*;axHP6%w^fMZc)OSVj8ODicM z*~&U&5t}SWk*xP>zwU!<2!UH*m%C$kx!WyEOHsA=2f0ePtvV={s$3OUw}0GLQTJ6_ zcT{xU?_>1zJR-5gQE;`Bn*65wb@#mK>FJ)H8Tr(nk-b)n#cLyMDI?d2g>NHx`Sxpv z*v3dX^7@D!{$EL|Nd>S`*UN^NV1^qLXy)~A82l)7OKE@bHu84!@`?CW7X7q%m5ezcac1q$C+0{eX!(*Otd`|djU03oR#2Bw1HM8I_<6rT=765n{{1y>+Gk@z(k zng5|HFqS+Ti1-6zW1&Ri2k}rK=zk(UnGC^5Fc=4(KM@Ozg)U7*qN9Px3$E$FROtJ@ z7hTcu@kA&|UT`HQzZ$Z?>^c^R91Z!8Oil-1WupGkqvPYD_#9JtJ{AZHCA*2zOQR?J z)P%5K`vr|6~#R+zCTQB)qK`VMX@B+owx4aHdoPPuYKEy4hgh=E*0oG-#ko+B=nYT%_(4ZKo>0pW;TM7&mnl{`nhUaEmNN;UBF zA`Eax4CW#tQ-qZ~N4!<4fxj!&z}rO_5RS-2#FZkfJ3r5bp*R0F>*!obD-!O1C6 zHs8KhgoQjOd|0Z1-;`?LUyCqs@qp}vkBYF6=Y;<#)xhscHSlo}1}@$sIpMN}6k$U4 z!exuCWDE9^EmRj_;ZhA9 zhgO84e0MaLY@wxO3$4qsP~20Mi-VJaM9yz+w#Vf?{JCuDGf}%yTmKR~jPj4>?cKYa z<288-b=l%8*}`zi7Dkq1K|O0X`MB=YSTgQUgr)+qa5Nsuoxg)@-^)FJ51FVHJb!=V znute_#$fjoQ=#dke|++IDCox)=3<>`Kf;aEF-Y+EaPqxQ|04&w1Mmnl-q}Ao-qAA=nws*D$2vVo_>whKzaSCQ4p+#Y10#$D(=r`zBNri{)ikVj>pzk42A8 zCu1+V5+|lcqmlP|ikN}xND!WL8OAPK$&Xx51dfHq<56%M9=%sQ$2}H`CnlrQ3*sR& zdla1aKMt)fgQvmbc{(5M2CTy;;Np2a=cBw>@lm`No;O?Ip)&G@|FQq&rw+cDDuZoY z#iM8(%Ug;#YEpy_cq(tV`Ghj7y8Ka@x)X>~W!2;~9F%y@FOPo_w{7IdilTm3bz8g)A&yAp2}L+&16pzvU!7Z!3oFIGwVn< zfnGIPbCozP@YZeao$5OXIZf=^IlH~uwxJ=sJ_E9wVQpv#ufy%rY&+Ay@Ejd$3_BZ} z!p^46OuoT6i#qEv+JbgkTX1U5*%Z;5othTjLQUH@m+S+us}B3(x23?Ke?HkH<`;ThffIIIy^ zr)SV#RiHzz%v`JPvgwF599ME&1!#w3Bv+1cnPXh$*fl0|>>A26(Q_~larDZf4(AN# 
zo3h)R>u|pPIA8mAN>^mX6A0bC7O54Z+!Enq$1I>5Dn>jBpRHUO?i z8qZk+#7vX6PzSs^@CDjh4T!eZ0HUpWK(w_M5N)jkL|f|t(N+T>#zo5RXg*8lR;|rd zS2tIzaD<-(I%}`4wL?NiI_2;*H)Nd)8YDB_>C&^Vc3syX0^8jbH#NeX;9AO(SM!d& z5s&@0+_5)Sb%oDtaxQFy{Wrp~Gy!e|+yvMJxEXL0pc8O2pa$p!+ybZp(s;#V-=r-x z0A; zHPY>5=NFEfo?k$EegWzE1*GQ}ke**adVT@v`2`eqsIP#gz5=Aa0;IkIq`m^Az5=Aa z0;IkIy!kls(KvxLN}N7lSj0@$*`ezJh?foziyKcT#XDvz(j9R9b?YSE4w#~GbZHUy zcgB9an&yb($01Ka-hn)z!M1Ka3D@q-45>YI?e^VOdTiA*>8uscTQTj&Y?Gp@jIpQJL&j5jGu1ymJQDglm#r}3hQIA z9)@)k)*!6oGrH@fPHOMFR!1yweZjT`k_gbN1r0t$dX1p*h6(6;LE9L;R?ztI1B|Z| zG=3}q^m;*8GP(iiR+~-N%ga+0t@FCx!RFCOta=u=`8+lXn$M$2(0m@71kLBMSXW><^@6e;_UU1L;EkaKjkPgT@DL%l<%G_6O3kKaiIFfwb%o zq-B2~UC1AuFedzg+p<59mi>XW><^@6e;_UU18LbGNEh(OMsfja&G#3v9&T{{bT)SO znKaxHrm^;SH#nbi@_VTIngMzbMacE{u{H5Bv=J+%?Ouk}#75~_FT;9LLFooB!zMj& z{r#<=o?+-~NZHbB%1)-2%NLr|>si;RZAkwVn^DeJ#pR^) zi2S`ZoG+bEeJj;#$uMkf!F|_VVEd8IGx7)5a=vuF^=7K?&M@pkIo}2@C&dBs_io~R zDIWCgRKG97@b*^R|CYaM%5HJecyMlYq+9jYIXRc0%-}YVQF4mNJtRf_JM3jRg7Ur{ zTwaPPml z<)>S=+S1RL=f}eE0K@|36@Ja)x!kvBp(%?z>Dna&)R)A|(1vm&cXK%<77py^d?^;l z5Y@MP8CK&t8@}%X+m8|p`ySwYDHg~cs$c76SdaQ64|6#s77l!w^QBlIcTxQYFT*B0 zKf?hyKWZ#sZsU0aW%h+Y26MAIgtRa(N{d4oq>r5(~p$d6U^!VqxD= z&R1e#!~(6bm#jAWx13mYX!5qjGrx<&;>Uc>(!yELiEe zXyG|6mlse@i3OS$kT1sq%YAO119N!+<&;>Uc>(!yEU;YJ$8%=kynuOKiUpb%kY^SP zEZ+|Eyqn7lD6hl<%?rp^Vu9ub^1 z<(`k%2*PVbE-#>*5(_jhAYY0FUL)+hM&$AW$|<^`oj(7a$?BQ#zka(Myelvtp70r^ra@EXy=YeX(Dpqvs5G%p}u ziUnRH+`LBQ@&d{!u|V?z@}*edHKLE#h{Ab6sSz|Ud~%H#&afJh%L^#4!~)F=$X8;4 z<^|*{u|V?z@|9Sic>(!KEYQ4w{AI+#x0QQGdcT1ARH_m5egSz)YXlMZvBv!Z$|LC_>`*G?M zdkM3i!K}BycrfcF)Sdz5%z6tvpJu&;S zgK3X}bNT}H66dTY_Y9a*d2TT48O(YMBR8O~lpEBZ0p-kk3p__=y@XlMVAflpzF9A! 
z_6#Uz)>{{S0flw^?3)H810rjOi!Jo4jJp;;_^%e#50_IvZ7EF2u(;fro zgn0pND0SlV*h@TZG06*f-_7%aSp?p7tn?h3!lecqEd+kwU^*E zBClsK>n)7DU|u7Pp25fqD5t~%%?rqvV!^CuFzYRhyny;rET}z$krz--x}PkV7Yf&i zyq>|d#}Ih|Z78wudF&-l!@D#2IkehK@ESqnp24iQF!F+VjWBu!BQKcL2$~nnYeZhp zVAfk0dBMC!7(IiL7f?=#1=DAbg=<7!&p`76%E~>4$O|Sl;`7)`SRfX#m%#Vi-z}R< zXW)B?UZ*2HX_|+Q(E#s>D!>o;-AKKieQ$$&&1p~nPZqunfq8C*W2`NMcg%X%2e#%~NON6R zZFj=)AiYhYw=4853f-d6I~01SLf@*;tqR?y&@P2;S7^6FcPO++p*tlSe1ZO&hkmub z#{K4QaKV#hDH=l;AcjQzq@?&c12Q6h@EaKT9zwg|@YMotFNgTS?_}Uxp;{VC_}zmN zOShBZE6sOX;XAtQ)#mjTRwyx7?Q8#yAb;CEjK2-|(4JF%7y0eg@~UU)`1bv5e7mA| z%e{<$i=@{r>9yR)##Jn2on8dToEm z_-&G2pQPvdG8=a(dhLG3ZD|Wl z2>IT-8@k|9u>3!(T~ci8KzA#2k3#ne`%B>(%i+`v3Rby`|Tdo(5m4IoyKpZ((dgj7j-ij2#wZQf?Px z4~sD=&x^79#h8@y#n?S!Ov?Xa%qPaAS|G;m5MxqZ5Mw=JOsWxL48O0U$1c?iG1el+ zq}n0IwumvQj)<{FF(%a%F}6mGN%cjH!I#l&E>f)#W94E@syjG_br@^Uzd+w3u9puj z|2MVgM%T#iYf|>=?`snI)N2yvp-T||zaW1L-!}bhahK)#BEh-@bxK?xe-7h1jQ_%9 z9Q6cW+@qpRyw6`^6Kb8X`E@g!e7um&lV&!dUJ9E@Gn;(8kWI|NOXdkRSJ?EM+2rGe zY+?>xViW4Nu({97CLb?k6Latqn^60O&2BTBe7ulN%)v`+!u^A=shQd2jj6=pX1xS36ngXcj{oX10BNlIw$U@*`b2#$C6 z^!D_G+=0NDJLvB0=h;k?-U3Zz=pC&gY7TG4|*2D#^#~28(Vy%;mL?e14Gg z4=a3nl3|`1=n>`Rt6V-fAan6|HMmrg2K>RHw49^cmE^Vk5)(H6g!7vo&F5d@{P36Z z`F|J*lHmuO|JMEa{NHf?YYPA0IsZEk=F4B_ z{MQfV^Q&k8K|Cn@dd}AsJMc3xSSraw?8F(yc5=Q?;dgVsP4TCf^M@7wPmpi56F2`m zied-%0N?C)q^E#y_B&GInvNzz#1%X-4ZnpYu8HZRu5cg`Cj5WoC*#Bwj|KzD0C6Ql z$CFe#Ff};_>`){gg-bjbgbSRw#-daBqhW%-t4#(*0mGopwiuE~U^Eo*yWQP;jm^GZ zV{@R-*c{4j4h$OH!46|{$SpQIdJSV8IkgU-VZk$K$aeesg_>umM{M@(84{aAx#nMQm^se;-o}i~1yJ)Ww=7)I=Q= zILrmH4&uJ-oEGJ670~~c%|?a3sDlEVxx5$``kYOFhK;txxTu=~HAP>{U!W7l@im{& z7j;@-gzF2yW003yfIyzm7rzS^=#_oAu{@wI z9s@2S|A@b{8e?qwSr={!|J8CnKHEl~&_B=f*Lj}j4=V?(kOa#R^4ze>i_rg(%!F;B zFA&qT;X6t8;Ju7+^NG~}slh7dFX9Kwj+i@a8jH^Bi#UEUAoeThFl?CVf5i3Eih@Ft F{{<*nkRSj6 literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..7676e8b634b530ef415f7dc9eead3fc5dcef2936 GIT binary patch literal 22560 zcmeHPeQ;A(cE6HkTQ&v^7%*Va!x6*~Bg^1MOr(rha_dAkPBN1GMs%LHX9oTku@m;9e?9*c_hbLQXnUgB2fz;7yVVc2MaL5_ z7(WVH)-iSl8A*x(KV3_R2H1AE^X~442ub|VJMIs~ypCgm=tLkKd*;cP9p30@>?3mM z+z%bzk@#_M*y9};3B+RGj|RMc&tuWCcmO1Re-wD0*razPaCtNw@p;29J0`s2f$w#` z;)n!;u|S->?1+thBVc{aal#uu9`GC+oAAHRR6V}q!C)Xd!%SY9^!kO7)xhXg-zg7u zAw0!TO$J^XYuK$`;-Aw?j7)@2-S7%KL}TR!jotJLD-Dc(*H_pku23^*?50;(V_@{V zzQT5Kg-(OUZhD0-1EVFckk68rz2V7_S2XKp9xP%?;N-fQ2YH@#=4s!Rr~RQE>@WEO z@yq~p=iopdfZjaq`}4FP%)$QESUfuB4|F=kEEvwgi987YJnchy+QXWOOXMxhVlfEy zVh|>Ca3T*vJWu)<>2I{0o54)}~{TbL0qbgZFZ9AO>P#MqJCmOV&XAJYNSN>nE!+va=oIe5q#gkR?C;FEkE{BsTtE*+5v;jd>#BY2M3q#lLq0c ziR9oy9)+tWOWq!=d3z|!!NcX)WOyuoOeDosQ+Wdg( zd|9o#Z>7HmccHGDI`j6>mA8jOx8gyK+84ZC_xfZ!>WKx$y_2CxG_VlAZr1j##;@PN zY{~fjjbk(#IX(%^kB$c>;-28x$$;O3UxEG}Z%qF+y(tNIBQpx#@Yv{t<;RQoXmZsw z{H9~vd(smOd*hzRBGF^qIhDZzk2fbj+~*lM(&U9(n9)X;FIe9^8W2;*+m9VyDJ^k?@BH^0^epF+be-%6PqMCO>jK<~aFur*IhiA=$tx}dNO&9%+6gSPr-2kj&pEao=cN& z&*2P9lczcU9>*s*zCv+TdKDdm<>}=VOVTA23)6)Zx23mH1idHlNek#v1U-tNM-lWW zZcN&A+d13Q>us&Gt?OF#)6G`11n$pdW!V!jJVhYD`8Q73(Q}bnw79DXp#}yn`ax4cd zg?^K3C%Enju6tsh>7JNpx+msQw~EH<9L%>L%;^vZy#9pMYF~}lzXz}1x}DM`_R#q% z+w290M`Om0ImaE3%DL|9bhHkdcX~Um4F)oHvnz|+z6P7dnL@bDs6T( z@K?ilq5n02=)VFG{jUW?|LXwJ|9U|5UkQl*HvppljezKX6Clp9gtgTEG`;>ZZKkZu zUbL(<^f=IIYgzeuSiF%=l)CJdFbB0#$ZwlNPdn;#y#{7;os;6`jW8b|FVp1pIb8hC zlD+mc#12x4Y=O2mLtk*tUaX|?*KM1%+0Dp@Ye4xHz{cri$##hGrgPfUjb+v;z47^J z%iUJU89kY{T1!7qpSj<%(mGWLV{WsW;W_cS_btoJ#Inp9vcS1Uc@U2=LbTEqk4{_5 zLX9h)`)1>c(lGk4EVq*4Wh=gS2ZnBZ2 zOWQP~;e2nhvGIq#(D4VP;}1y3ANvS>!MM`#2c+WmlIttpSItx0d z4i?-s)n3pA^u0hI0{RHh-9YyM-3xRd(EUIU06hrw5YWRw`+)WX9h|CwywlBYxXyMs zI^nnrjxIRvh2sz$N8sp&qX&*&IQrn|hhqSaK{$rs7>2_KhaZmMl>Kw9nt(o6JV6L}HV@$b!v^>u^oC(|eIe=-TxNwTTZ_Job^q+B_{J!O!FO9RlliE8|3>)p(_NrOdjx_$r?^(Gwsvi7C;A-`uE=Sy+aoz%W7 
z#qbd7_3i+@tu{K}Y}KX7TD^9L=Gt%qwkf!dT`yK4y~Ur%U-A&OFLyJnK>NWluBXIzXq@w<_>zaIeWjaW6^?)Z*C9UGz7pSo6>ZEk*gzH+g(vO%>}c#;bq&ieY>QX0qcu_$?OSq2Fim?SF>FbKnmc z4*n6tp+8-S?-fgSe1|cAam|t9>%;s-o*ZAchSc!2gyydw^B47$_y#e5kuS%WtvOD< z_R#zt#{5M+CB8n)U*ya2Wowj+uT?aE{g}U~r^Gji`HOrxzHCi%^Rs>M8N{ zVg4dtjxSpSyZBm2^Vh#r{=zr?XA&+|9P^LLo%uaD=ipXYB- z%HO?8e7iA!r8)!ivIp~5sW$8im&oqAr zF@I4{iSH2RFY=}M^7Yxx*Jql){g}U~r^I&v^B4J2eEIs^#n)$=zk^HVuUMZ&{;KQq z68SrzWIZ{2h|=_g*Ew!a%wN<~ z;v2;LMZO$gr8dO$YFY@L1vh_JrtD?Pssr(h|v&dg{eO@AeeM5JW<8D+#8`a82H8tAbsD_r~n^j9=y9PNd#}{j9jBhSA z^e;@=d$3wV^Y7hRwX#u7jd3%oq51dpOs$N1Mm05#yHO3zzt?Bg%0@Lc+8fo-{60XY zRz^Leni}I{R73Oo23fVTQB95ZH>#oK_-578*zQg87i(#ZZ!R_Tm=a&LhBm5|jcRI) zn^6s|*2<`7R8!-)8`aQ8wX#u7jrK-0v|1~po>5JW@iD5QjcR41ni}nIR71=0?Gt-3 z*3#JSP4X9OX}Uh=P(%L;?n}$}u+$owug_Vv@_<22jd3%oq19R$^^9t29CxD{+Nf4G zs;SZ5sD@T+Wz;jOsWCoAHMCK!Y*bUD{f%m9Ile=3EsgEoB!97%rt5PKHS{SZzG@B4 z*XOKS*=JBwW892tX!SlS>KWD4IPOL@w0hqa^^9t2U~<&GpeaEK1MaPdS4gy zjB0A+->8O`;~NzDi?uYidz1XdS{n6osi92}U#y|=*;}_|CW+7N@Humftu*<%X$GFt zlun6f=lFc}HPFZBPid>Gyc6{BxgkD}JB!cW_P9d$jIbQOOW9nmP{QT1C#Y^K)x~Fu zmIJM!N_=kJ+8Ro})cQ)s(FZ&T>)3Vpjm*C_N3h2E*qcPMnNLhn*&heFpWv{Rw$ z6?(TqH%K&$C60GH+_S4#?tG^ehVXzTLFej=FjvJqO-S?fJm`pdiqG%ixv=(PX=^#) z_ClDa_>3RkfhebQ7N7qrb9Nu;x~8?6;VX6ey0*T^Y$4>jb?v_r^)<%dCE4wj z>>Q6UxkIt5^Dus$WOt`z=kzkUQ?aWbVf=c@Zl7egJHX_-6}yH}#&3}9_H#Qz{`iBc zMo4WdU^7gDTXb4jdSreZjZi!%J-%i-pSXv zeSD4E|Ap72^I}cv`q%LpnY3pL z*?mGL?Vm!{DP+=KDrEbGOxjn4tXasUJyyu-giP9Rg{(%%q`g{{;s z)1Hp&?={H7pOU|Y_n+RH+iUuKj^Mry`@G20cR{X${HF$Tv=e;sT?Kuy6r zjC^wWQa&#m`Gk5y`1~^?pIp9_&u5H$Ld_z4e#^)wmoMcL_bilUjVIJS!e`vbCzmhf z6Zfn|KB0CJK0QW0xqK;~xMwZ$3H6uod9RUAE?>$g?pcd`LJcQ;;`^(M<`S1L`Gi_h_{8_d7hNZp8~GG_7S2(zXT1dPwPxNyFF!qBrdi-w z^Xd8Jc7`iz7_O{kSnjmw&wSi4{5 z_e*Sdg(DNAfv9KF z8xP@+^E|PLCmwdSG^eVwGaX?SDd~hTUN&THF$4Q`mCqt#pF$eU(NYl zyCh{eWn?k=nWFFF`X4F$4$jvUKkU!m6_dB?vi%?B`rm7Q`^Nz=l}MRZ2gZp->J0w3FlWS{D0&8w|lewe9rlyo@{;zoj}E;SK(K1{w2i^ z{QL_(#l+H|?Pn+FfA(-TzlrnvmGN{7{|f(I?LOce{SNdz@Qr>4N*oiB 
zcz`(krzYU1t;8`paoiE|#zKVukNkL)IHD21H|`~lc;IB5YJ11WMt~g%M<&CVE-u-6v|dz{XuyE3P)w#;eU!OUsr?#yX>{laNW=Cs2l zPV1X9&g$EmGu-x!-`!0aXH5-zgu?FTj0cx9qubHcAr$tscZyS2hck29oVf;9bH;DS zU72gx<67|G%ILPWWfa=#Gp)7NHwnLOZJ8@;cV&9q-jZ?F(Uj?1NAsS%en6X-G5-42 z(|Pnr+u?iIGA1Tp9*EN!Uwy;Z^5{pLa1?(JQ;5aBC1|w8y-=8m{ZU{SSH%4h+p=<6 zY|Yl-x;ljBB)nZMJ@+3E!}0UvGg^-27Q zbxHhP)k(&tV?^bgF6W5b5ss;_Kgs{@>q-81U~&DDKMF~=734W(B{&!MKT`O@USJ={ z#n{QJHTN^h$uDd5kd C?gkzJ literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_80x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..c59edd0bd189b158dfa5f7e889fbeda3ba9e0a1d GIT binary patch literal 24640 zcmeHPeQ;A(cE6HkBMSot4A@}M!#JrSMwY*20!IE~9RjH#4l%5QkS*DAkS#}+oseP8 zvx5nnrGa?2X_|f{O9)9fo85G~A8E2fWL>*o+lX(qc9F^}`kIrp4<-+8}x&O1-fI@PgnkJV!F+6epPkn6<4_mOz{exjG{ z?8`-&huGo&<)nb*ff#MQOg)b#xHCcRd>o3!oLrPfyd{hkloT=^?O+LntY zWW1oV9j?`Ti0Qo(>2#E_B$3Z|_{q0}tx`&YIANYp@p)tSfcrZE=439l=?s=C#Iym+| z*>~m#E`NV)+#mM&`}>1qW8aGg{Q=*D(VT z+x~(pGB7X}jFIPEV?z%I?Jv0|{NeGS@A%M2;AJ+{*Ec>e5RA?+ljlbL0byh}F?zA@ zgpaxqo?<6PgU=1s)~HMTE4swUNchC3uCPr^R%kNWO;=cKV)S{hutls;GnwqBE37av z`n*@zDpu$=ne3)3^q3eexI&JS=l$W)kY7yeWgct-5(uuFd603|o~38{5Nvk~6dcRIi5!GLmiD16?ICAkid+lX2WPW&0JS$$G_)CLc=JLAUeW^b zoopREm92w6&%gmkL`w=0&t~8yEfANob?{=g4!)m(0}+T6B3{YBOIjd)oUMabvvu(E z3>@%4w5AY|$iPclAby#xgI{Iq;LQvih(M$eaWw-kX@Ph)`zi99-HT7#bF3 z^UdoSct{Jvzh>*;qih}gdj<|J-6aR%;|x5c1>uuy9sEzW4t|?~gG+ZxLAYWe8Mu(6 zaK&QF+Jil74+R-`xI8u*9*P|o7sVAzVFpgpLs68qhvKX~lxE=JdqZQs$au`xSKW{b zhn9h(^l+4C?V%!T50#7Ykl9m}ORYoxu~gjLY{tbs{JCQBn3&zHt$z_7M)~^EX7?`U zcuk%{U9q%h?V&Sk5BnD5K|O1q@p0YDqp_%OEI8~R4Mn2C)cNaTb1(M%^_Z9~c>ey@ zH5iSIkHYkW!@-f5Z(!)5V8Dkh&_CgW>7UYvlJGRr7~l^N4UX7;GEa^sS1iX~bq)I; z@(qOjG2erc=yC3x>L9`6!^wAReMj!9_roL1Xl+y9Ky|}laCq1^Fj`xKj4w9g^SB>s zsDsCvm=t~e;h=w{<8{|)bZEjK3mWT4EtLG&6%9TBk44kw`!-Y*i{&NP*x+c?*B==l ziH*MC8apxE7YV=HkiiOE#{=-3%W!tZN`B~i&_59zh(;i6c=TTJ9CsoZ9UF>_%!-H1 
z>``#q|EOA(15bm+^K`DQ2f7lUfQ#qxT#n<#hL7SeSTy*b$@Xk`vit9T^N(9{fX5BT ziO12foBCvM)FcBNh*WCMd3(4z>9bPWS9W&;e=4VdoQ8uE&-s-hzjLgO{AA9Ow46yM z;k%JMYvMW_fe!Rzpb4P#B{6v8iQrYWt8S8^C`_u=2Ch~@)k;A+@ttS3yh-_#!(96D1~v9u8%u)=Naeub0b$USUZz!{H{$b92_5yd_ENs!8CINox_(Q%URE$>b?QHkCMM zop4MIQ;xV3>lqCV-x1 z`-x&kFHqh3kaK-$$XU86!KVVj)9I4%Vna&_{0!A_fwbQ!11IbFeNAy5Y#H@P~&$4>CE z6W7_;iR*0a#C057O3&39z}xrdbT|*V{%O0td=;+03DN2=)V{!`Y!>B{?`LV{~LfJ#-{C#^7C~41=>tOL3!R%N9YmY zllFqbvyj-4pLTf4ivfe01FmnUOHaD0biD%5T;-;8<9fgYT$f4ma$0;gVtj5-#b;yw zj?meS&e=^c|3)}IC(unmHK0zQn}KRT%YbeMx&>$%&~l(#fKuFJEH-Mh8$q@ajsbmb z0*bzzK+%^56n$+5ioVK#qOUDL(N{Upx~Zk{{eYhYxA z?zoXW^@p|<_Q_m02B+N$-%Fo)%eK@?Y)kDS8|-UT2FEu9b2^saKV>fn)h&PW8+FSa zVf0~JW+zLQE+7q#9h0^_p{bgY(s?x~XTT>YXI+(KgIzczxA7 z+4+X!rso?dJ>Nj-`Nnv_u^^7<`36eQH&A-MfztC0l%8*(!Y_>-@HBQnY3xuB{xJSD zc0g(DfYR6jrLhBgGu%LEIBxb*+(3#Z&b@m>6hly4lg?^gKLND5M%Oi1O+&U}4z#kqLW zisvzz?92xoTOE^F|2mgCZnfn+IeEeHopPbSL(<=LA z=g{QdoF3qh0N)GzQQ(gO-v@jE_yOQUz~2vi82Az3M}dz59|L}3vIugGZta3~9)ztM zwnMP>z;*<-Uf7Ppb_}*Y*aEN(z!rk-e%QjWjlebvTNJh!Y!j2Z>!eN!4_seGEO0%- zz6BB!@T&z6F-N{g@OZNZ{2IaA7{6BV_~HfV*9jh9Q~0{wcye4U!$`8qcXp0Be^@O+(H1kcx5 z4*YhzUDxyTrybiL)AcGTUbrvE3wb$S$jk9UUXB;?a=eh280{{ew#C1db2=XNvUOCDfBEMw%f(VTccAQG?!{6y zhv*g5cSVBHN*vdn&&NsU9_5FYak+#6eH*oRCm5}(#JN}gh|MEm1LX(TaJhsLy^`8{ z5{&M|aor_+oP-^eAKJ*}5{7gawf80%ZFl4Ro8MsbOV~pB!CSao!kF%+_MHhv_u;ti ztuSu0lVZ$S=7?A7l{0dF!!d`p!x$yciF`<&rst=}%V=*MuBrMhHeSLh$`96axrA4g zcQ}9kR&wI}U zZ?O3lY#kZratT}HFtyjbjFzK)?+F+u#hqdc^E1zFIOga#U<}Hn94D})zs9e9M!OLc zJyUP7@d~z%%y7Aat=?~Pxq_{uf5hbqwtAlUkoi}zb>xq^T)|fFpMqREMuDxPe=#3h zALXWFi{>q?0}{4q-a?s-EtX3wcuq~_EgYv{i{>qq%h+PM*UfWqDsSO91zR+4pwtnSnzy9; zX(Mkb_tm^65?&Kic?-uW*rIt02+5DPFM4w^YZ{auvTQqN>T*j8tml*e3I8MP9 z&08p!v8D7e#{CwKQ?Nzz7RqI8DSeM|zlGxzY|*@hav58!CK`Rxg7;fkSEQOq@3&C4 za7{FNE#rO*$1B*Pc?;zVwrJi$xq>a4w@|KNi{>qqE7+oW3*`#77L&KmL7zDO+v!j3 z70r4=v)&Q$W!5XIJt2-W>m3mvX1$_WPiWRVqPlMX)yV?_?tyvF=`1)Pw6-AFIdPdPJeqv3pi)ycE))Sibjz;dn`XS{mwI{@JX1ybx zbF*I2tS2<<9ns#bS5$jK9B0-$;`ue}70r4=v)&QyZ`LacY^gmV+M4x{M()D;WR|;7 
zZkD@HZkD@HZkD@HZkD@H{uy$Yj4kXL1-9-}u%-5j{5e}%PiWRV8hH!rhm0+yCp7XF zj+5#no^zVFP%hO){v0o@Cp7CF(Vpflw3q54f9_}WggDNucf|8c^A_4mb&)?OOzR2F zdPlT3&0APocwIEGrSyboYt};=c?*3g*!nW{iZZtFS(m`p-&;)b7WPH_p3AH!q^@L`j}+zN3=K1Ti97TQbLQhP!pZ{axU{vOXS&08p! zuw~X0n)QxoZ<@ETt|-{zHPPq^(blYoH1ZbuP_Xr7>J?>d$$9H3`2Jk}-bd{fc}*m8 zPiWRV8hHzAij=q1@2ZTvh2s=#(Y%Fn8Cxbjp;_-}7+>9WnllklDvzMt0U zbi`k_%)q-2j!E&(9ln?K5{$=pIFfcxVLOb&_l@wqn$!3WO_PVd7g7j$-Rbd!rahkW zX*zZ<9gFWG+3tjQNAP_&c!y+q@7{7k*!P))%)MJ$0S;FJtpr*I6kkrP1zHER9jFK1 z>xA!L@q1W8!tdNd2;2u!x>aX%TY}Ns$Vw`&^fJ0#XVjHow2G8bz1z!Zwa#cwg3(%1 zOZ9bLM(cG(8xo8*l6I>1co}Wd8Qqa!^mbflF?>gy2l2vp--_()T|$&=PJ8?}NqD~v zj$;$VVr34z>;to5J6w@RbU`UEy5{U#0MFg|All8ilV_ z_&SBJSNH~nZ&Y}X!Z#`W4u!v6;vtq8?^bxORMaSMkptt4%PVwD9cz3dpViw=yQZT!dbY87EV1-nayjHO;&k8B=ntjcG z5R`Ado5{C=9NH%?|Mj)p{Ku(&+dWLbO|iS}UM9ayvfD*cc9n;jzEZK<{&gnbF4;9p zcCP!F-lf=8`Ix*)vfC}$x&2J!E^GbF#L8h-!>}m&@yjHSnk?iV*n7&T2 zt3S@<^^#qyWY;jv^bLw#V}!{YCA&7s&hr4%dlb8-F(z-4?Aj%}9pg;DL$SO4K_
    ggpID=}x$`?LZNrD}j1}F9F&Ld>K$ks!ERA$#dK;-A;2(bAr*`JjZ!?j%(3N zsJ=D9XdBOQ?L5bIeCC{VR^+744{ST7YfJ}zm%=wI{BDK!DtwE=w<>&_!nZ4Yhs5)1 zz5||jifg_#2j9<2uUWtToNHF&wDWx9Bifl_%@Lo{wHc?@sN?%WU)3^(o6U{3_+t=r zwWe9{T@%nfB6L!H6S|W^C)GNk8y7mM?g`z9&`C8==mv#Os)s^%Oz5QAD0GK~PO6ha zcTni0nkjU53Y}Cxg|1!bq*^L;yM#`vt3uZxbW)8Kx+aEaK2%S`Wg>JLZNp)E0 z)(f3flZ9@z&`I@K=n8~Rs?|c5D|AxbMjhSFM`!zfUzC^Y2x-4u3&jgLhnCpW9)%F-NejL!B4b>90VqgZ>vLdbAUA(W{`(*Ufyw zy@K%hH)cM$ej%Ti&3wZBhVc29WY=J|wsC*jj)=9B9e@`*KTo=>>{5wy& zyk1aYgJ+f}uP>`)bh(?+6?KdjdTjcWVDm-S98wsge=CRcev$no&bzGHrsKR{WM7H% zF88u|zsUX)=Uvgt=KVtZOqjQ@9WX|ptxdsmCPE*#BK>$B{!hqKM&R$v*ayGj#m`AD za4?p@AHdPif89t%aL2>-@T`M=dW8ed3$kx$1b&rthX%h@s_om?uGP3*@K39#3P(l; zgHhk8KNiA2>i3OBe6bMx(SLMwc+A)DAMuTikB&y7F&NQ*+&?%pG6?#hFA(e-AM_24 z3`Bebk*Lof4*McwzR_r8U??0Ms~U^;SB1k9!`ne}f3QDB8Owf(_BZDV~?uo`~Iw7;)^t624XieFI?cnE#+wsc(qmseD#%UO7rkXIFX z86V%dJ$*c5mylm7<2`)*`-;4c%QeN%U0mLS&rGGyVYnSL31N4J%k2&6@(Wyk>|nb5&$#@>{&e|eF0W(@Hk5zOI zE`R%OSxU%1aCt(Jzs2Qs1L@;G;POyUy8I(9?^fjh$>pE)rjNhD<NGwQPffFO}+Ze<(I5O@E`Nu+p|Bv!ml(?difIsFZu2}G)7#-~&9_j~iFdU7* zB_0UC1x{T3kzxE15rV(75%c!}4S_Y=W0ZvbeZjEL?XKTr?0Q;^-Ifkxx4p*LZLLo2 zHX6Ha9n#%ygh#(LXYjrF#BjOetdqSMh(CuZwtY|P}BIWn=q zU-R&_AHJ(){~au838-a65w`eMAbxF0KVpYd@q5`qE$Wlt(H3i-FcWo9&`v%O>mbf$ z=e#IyD}ev+WD=exvyZ5Qf|hf+&LLufAdJh`3^) Hu;l*$zSBwi literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..697bf1870a830392710688a64e1a0a645ccbea9c GIT binary patch literal 24592 zcmeHPdvKH2mA{f@Ba96uyaX5|7(om%LdI_bMm92Wh#^ik50a&oB}=kwWLcGD1v1q6 ztYS*DX+qp=HrvfMFGABt+U;(#O}DM3W?%Uu>_E1&+srmI-cCEyKiZvc+uiB3ooLUw z5Bci)8iGW#|3u8k@1Aq+IrqEwp6}dyu0HFL-u^>olgVQt>{CJJh>4dGJiL7VI4ks5 zBCms3;s52NhExF?bv;bH0CU`!Kr63@Y|$oX6@jsykpf8_qfrlTfshT1COeDq*S9iS zpsa`Y`kQ;1{5KQD7-uB0R`UXH&kW=T(JSb4^}BzP3S$30XnSJJ?*ct+?|MJj79CH# 
zVf-j)SQxP-?;-_4$W}LdoQp6Cr=l_fTRa6#_{xm;jzH8TSu`E)GY>1ODjCu9$x;^n&+` zuJLd<8A_3tUCEJ$L)Ndj(*Ec~$ai8S7JQXe^#vxv;ZS0lsk{{T2ZfSVqx4GPl#iMa zmQttUp_fLQ_o%n{C3=hFvFNF*-r+&fSe>S^Yu=$vqjaP1uv^@rL(|wb@329mbffRE zN8F)X)7UldutTG?=pAw|dD$P0NBp8$53^tqE`iQS0w2Hq;yz}qDl5Cbvq5tmD_QZx|XFW0~i$~ExA5)5285*!&5 z%jVm2C0HmLgddk{;HTvp_*n@CE*zBy;TI)XC>n%cmuuiRtnKTazhfG;Md$BZ zuHnS^L>!tQ9t*`%zVOJ&P|$}d(7)ip^xx7$NjQxxRPaYfhGUl37w}Q#l4pL zFC6u!d=HH$PH^K?1_2%qC*R%Yd+=zhAC52+EggYyQ`>N8Y|IyqH}65hmx}p1+fKH% z_yWlY98g9>aK4Gr(Ki$g`D4A`cEuATX@3gX3+|Y_=1PP<0q3Jd4Sx=n7L(~KuHQ^U$xOcPddQy+g%jg2kT@P5ak85ZC6XiK zu^Dmf%uj-G0#1p=IXkb?1-KGU#y$L;-oyD=UtD-ZZzT>qiLOK5=~EN6?|$i5 z_g6p}FXV}1YADWs${7Swjxvnk{9Sy&682*LljGkT%yZ^T2cY;!MGZLvofqGO%uD}& zN*40^tSN6go6o~{I92A{9NdNh_zS=sAf2Qo=G^l@djNBQ2A}~*D$O|$fCq2@X;W3s zfoQJExe+U>*l<{?a!8U|a}FKB0CeE6Eroi3o)usR>FTQ70N|sl_LvfGkJQpg8bQRMWuW4k=$e1%VcKuGI<=XN8vgP*Tva9`P?jeS)M$> z@lQB@n&THKuFS8bV^Eu~rMNV|lwxJRlH#`fHi}U1)A&sb)T0RXC_+7oP>Z}9wYRIZ9#BPFb-KO8E*-ywBVs)IiT5BxpZIN|(`^+HNv~LC- zn=P_72Y7~+b2f(Oscb{UzM(#1uiqT8Z+1lNj;-^$Hv1Inugy7T9A_Lec87f`L!3^# z(;3-DGTQdfU9J5LYxnt5+Rw1|r=gFV;a)7zXG=FB&yniwnd)k%WtlUA_l0A+9md8W z&&{F_9I#hqE68+VjDMTYWDWDlNyGdwT&LlhoW$|01s!sEa;|2d#Xzj(xSZn(j&&SY z0@lD7kjrVVljb_;7aSHoy&l>j5_cZUC$Y+z7Y{upV$T z;3mL=f7rk;>m4(zfWHdH6Wd!2i0!Qb#P-$#VteZVu{|3gwznP-+uHz$?QI0a_UZw# zy-k4F-ey4b&5X6$`2_7>jbpl|##yzjI`S!?^VXU=D?HaoXR5oLHt=0X4a5bz%gDQS z8wLcA{BAeJO&h?6;CarIS7$M3o*^D*9_AVzL9zwfazNkE*U#IizZmvS;Je6&dqDXX zz?P|H*+bwH8_znPXsNMI8ZFOFS#Gn!vu0%TR%`Y9`A6=wtgud2!kpNxX83;j>^qia zW@1@pjaZ=Upgfqb5kef*%SWfIHIbI(Pk*vyd36+RSZb|g$+G2S%34kNCgcHK0l1{n zGl}ScXR6g436fehQwYrzc6>J#&4Bmm2XEk7e^_9MgRM@KTs#bM+*yDTkL; zZ?{x@e)4?vGfpADLz3S?p{4V!}W2v2H^_86@)7cR|Kw6xT0{y;EKbQfGY)8deU&6Hb~uFbE}96 zd>P6nc<6y%EoizMx@&HYpy4?{`C37PUn9Lv(DI7xWTFZxA$YEI__d z&`TLzFKB$f0pyzmy^PVDfo`-|45P9#Q`LCZAa%#*9D^B(TY?#5m(^+*OP6Mmivgb+()G4J|Zpm5ox)PNXvahdXYYEBrn3c^TMo%|Lg7F zoH~8yV_Dc`Wm|3bPuuyPWOZ#f-IEaV{(soPB|Ho*#7t?chhYt|P`b{;a5br-w9Uh? 
zo>(dE@Gx|eYDzbF7&h86mh9~nXR?!o-Gh_-=f4h`X6ohV26=(XnR@m4G2GOaJIAmE z>jjtcdeVF#KU~ZC(!3aZsD4L|;l2iJd*zF)9cjLhA6Ucr(!3chRNs?h=*4=$^}L=m zpU4ky;(TddjW(*^mt)x9i0yCrYu3Ir-^dSa<9un}jSi|mkYjic)(h_7^(22les~w> zOMYSOrTWKm43E39{iZkT^IPoHzwKMAvyDdMG=28t8CZtlK3d=X*3XQ=dnk^qdku?Jw~#v|zpH2(PF3OKgnuC4V79 zRNv}hScCH#`2@@>wxjsV=mh6W{z5`jzs|#OHR?xC@p_8C#6HRSlE08)s&Df!tjBqc zOu@Wj`-;DePIJEGFJy%3J3I`Xs2}|duc!D+?6aIN`3pHg^&3148*zRkPrj-57g#G$ zKKiHJUtrA;>k0Csf6nw&e~JBNvA;xKVE!@sGQ;SX7{*>@{u22H^N-Qj8AiXwF!tSf ze|fmF*k9rpUob{W{*u7>f;`z@Sj=ASioYZnZ+{#`uDK*k1uH+U*bHzBzSyD@%WOK;>&+oiv48} z1^JS{@b$>X*CQHV1~I;1J;h%F7+;Vt`3ql<9DF^Z@g<1y1?wsP62|y~e92$fq}UjV}?5FIZ3Umr;x_$d~+uuSXuf z9?|#`#rT5t6n}|fd_lhCFMK`f7xLVm%V^MO}{;i7zoFzC?I@8RhXM%HvCn$Cn6?FQYuZ zM0tFPN%7_1l>KfT;|s=4c|F4Tg1m+65fOWFj6*4mFIZ3Umo&x~wy5y$v~ z^%Q?eV0=No>@Uh*AN5lhU$CCyFKLV~$d~;^$qjJ-5y$v~^%Q?eV0=No>@P|#f%++o zFIZ3Umo&x~2e`m&V)Xr%aPPv2vAMD7s7hF zoC)V#mm}$NAzjXd`nnuR&4sX@E@wiYxmu1S?^)zr3H5Y2lE{rvj=9o}%8|ZnF5bha zIg&0H(&bF(2f7?d&4sX@E@#4d)a6LJTu7HQp}sChQgb1!r^}ge9(6gAE*H|}OsKER zk*;bPoY$-6NV31kxf1H>awOSbFjqoOM_igF=c&V+uT%aQoG zQ6U$?db*qm=TVm<@pGo4Tu7HQp}sCh;^$I@TnOvwaweQdU5>=hv5ImbUCxC1x*Un0 zdlhmatf$MFaNe($Bgy_E=Srxj%aLS%!CVRb*;bPoJU=bq|1eLITPyZawIhu!g{)#3FlFlBk6J>UCxC1x*SQ(g|MD3XTo{ET8(5NP7LO;;uNNO&G^>jHC&Z90z z(&a+BoC)=HIg*+SVLe^Wg!8D&k#xC`E@wi0U5=#YLRe3iGvT~nEk~05WmL+QP%o;< zkz{|tT#2qnHy}s)EPU@MejHC&imDJB-vjQBECSbgnFrK z#23t!=z4SmawHS@3+71p-F{W&bQa$U7=-U4+U?cZ|29p-_q^4U;+-CR-{T*lK7OB_ zw|3Qep&q`MfZz9@!SD7DbVcwTggVH*?Ok1wOjnmPL+g5JU3_Q3((8@b@O>JuHF{p7KLtA=r)CJSLhCf?o{Z#3Vo|Y!(8Bey$@%Xt83kFcEJ?<&XS?e#EbAui02_A zJrC!gj(8sMJtcU*#qnabw+?W7B|H!KE)%@}RY#u~c=dVpRJ*aP%dKv*n{oq&fH#O{4OcJDW=G*))y7#`rU+rwjbx3QkedvXj9 z^4RU=vAfsUN#%!f3~&3uvGtsYt$puV_DRpW0rY-_?o#Lj3hhzoZiVhq=z|LFRp?%Y zKBUmMNi>g5{|@J#A~t=$0^ijtUc*;GPH`jG@b--TiS|!9&e|N+*(t*@ZFjUA_^!-H zYy0)B?OWO{*^96bP}g`9zP|yo=Y&jJ?}h9cA(PgAA$wBDql|m-%dr*e^P~7`u;mlLSm+zSWFngbC#gvcsMENTp?TL!_!XNF4 z$PM2U;kxr%hzH*ze+TcsetUMW>HS%P`y$vUiFol#kQ*TXo<@#(f-iDbwD}XAP1tJ* 
zoA2pta`|F5FY9c=eofeXOJ|eI7qj_oolV&D37dbTv&rR)*~B#-Wkur&`$l2&3p$%z zzL-s1;}_V3y{EAG8J$foU(6=1@e6Fi{#Dpa>1=ZOVm5J&UtkmVz`~|aXOqhpvx#f` z0-Lap7B&y-Y;yTxHgSz#U=#M@!e*<^CYLW}6W90!Heo+6Y&vu{xqLC3xW+HA2|0nV zxlCu1%XK!z8vjqAC)W7ChrjnI{H<5r>A4!G1>U(iJy+YnaCsxc6>f%gEeuz7Sd7;| z_rqSRNnMKmZ5g)vVV+}PyR}^w1Ka&D&oi*yEN2b$*VVu?;N+-Dtrhar6)rsXB%*=~sm z#c9qzsnkEu`8jrD1*N~>{JB7ZMGF@>|JnPB`QPAta(6NRJDh)fpqT&9oZqML-{Jh9 zg^KIH$N6vY-w77dpK|^Mh5swg|NVoaCd2nRzvEaje<^jKCB&)lS9AW46+7_f-tYl` z_;|6Mot*!M!f)mLx0LbpaDM$zv7NV&Z?+Qgd&ok12k>>j`*05Uy5D^uuGn}gL|nmB zG59G9;u?-kxFY^!gz*27pGpu{Vm#!i{bM6Tzz#(dJNmXKDWE|P@&k-Qz-Vh3q@~7q1e+rU+gRt4;~c7rjDjU zv1gv^Z4ul(9fe|RZ+n5;*(-{j&4q@0n+py1>?zdkb{8sib{4puoqL3Z-b006boUnQ zcJ~(U_h56O$2}c|3OyZCg+kHWT7~NX|e3B0Q!IPdH9xt zeZ=}G(8>8iF7!DY{;r$QwO7c+dMePN=!^RoXa_mI0xtB$`Ylkb-`ouCgD4~33>WrG z=zGNPy9mTe(#Io}vA7#9)UQGUKRX$S-)k@YE<0zdH_!@v)WtsFN5m!ZJL+-9F6>Xl z_|~ol$+mJ rEszel#Qh8Z!Ei3_9g2ldA72+l|Aj!bE9eLmboy^_{VR%sLX!Uj>muPi literal 0 HcmV?d00001 diff --git a/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co b/hsa/gfx942/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..333832ab1031d81881f6160dab86a0835b88f4e7 GIT binary patch literal 26672 zcmeHQdvH@%dOwn7TQ&wvco{J0LJVP3BjmSDz{n2>0YjWD*(F&LS+-=$Aj^&{97tJn z?O=zcX-L@ZcDJvPgph1DuT7KPY?E5rwB7C>&;gp6CX-~w-O0}EAKMw4b~`(p7WMni z^ZkT+Bf_W>6m=Q z?eB>l@P~c=o}S?F@Qcx)Kj3>P+8+yoBoK%K&o?~e?+Kpi3lDbt!_T=R{(<1r?a#Xh zdwYk2G4h;yxc}jx^M~#cfA~Pqx4%CUc$umCx)1dB2BQkm1+jN>J^r0jK1hAY!O#zXa>9L6|U77ebHCg zDz4C@8SJW8*r+kec!fMlp7V!?LVhufrrc9ek99gVQ?${R3jz zJUg9*hqOufBv%K&&eg$hvT$&EmploVvha{L3ICa^gWu)q;J>qQaC(O{3FmAi3m5V% zoU=J{_TbFfLrE4M&I}KQ`(yjXhvJ;AGz%x`Q?Vdt4`n%fD9^&fi~YmC!2>a0cU66A zI*cqFrB6pi&K}n0?4j~nJY?5ZIo;atA5P7ihmCk`&7X6&Mvd9kw)HPSGs@SUHoEs( z?pM==I%jLo*~7M+J#<`)2lZY1ob9@ohhkCRaB#ps6dH^MQ}16V8~a+{zdahWjPKu1 z-F?x)14A%;-#{=D^Y!*03Dfq+veG$i-Gx%t7&ervod%%Ct z*Bkc7d=Cvq_jBh|1_>UUlONRh?%P%8hZbhErm4HPs=hBcFyQMQs;Nf87mN6s>JQe} 
zLR%A)qOT_$^hdV5?H-EukN9Ij>pD^wO5SisgAYPmG;O?Z!-`_F{m?z!Hx%{t3?7KY zhMsp19~$T$41Z9c#TB^s2cXYoIXh=3ueu-dj|6+8gD`D)_g>M*9SKH<`v)T@MUxqN zz3K}^H|0S$SoG7mLLJ~d?0}1YJm;gnIA|;0OH3~`zmd0V*KfbSfA!A~=fOE1_(HyD zNmB;?$>OfFv4NSghMakVS0=sYN$1MxW}uJcm5`%wQ=-qWsQkYp9pud^Thca}Ou~0K z1@^==+<^&r2`~Xj-?&11;uO$czyzQPXabUad%_Fg1?)uHR**0d?F9)BVqQU_6wy(T zKp?a2iBgzu6VSkEF9JP4?_BVMba6qV6Yw~(8^`SqBk6FgNy1zw?RJKQP@E(MMjX!V zVSwhhq|sGgC?O?J11BBsszHk2pTc@p+2#lJn>s z%uddxSd=WHn4iq2xH`F-BIrGe-?V@pMbM)NdK5vA;?lUwbWOTWEOs@YY+lrCHjg(y zUS}4>?c`|uy}7nTGu&>HJUeC2D=19bZyE(EnY1rJJd(688ciM{WW^HKNf+Ex{aA6_ z0J{=0{{nH@;9IweizS-~IZB+y6HaG|V@X+Pang0N3w*j(fK6FhXi)<2C_7J-F?@o` zmWEtQ%R{d66(QFOBjhqxrEJSwV`#r9VVpFM8Yf+bYiyiURJbZCLaWKRcKl;k8h@OP zdn%jp$JzK3aE~kCS{!iCjut|mBE@UQi;F89vnoP(T{x#}U~WwE)D*_RW>-Nxk4#u| z{QKm1+_WDXHSNdYcnprmM{z!9gAF-9I$hH4FcD{SoWpT0$5M{-083yF$oUa&Gs0~~ zrkTyiG_x6*W;P?!Xj4w#!%2w4m!?d3fAD_Boz9B+ct1^eKh8CjE?N^hR_;2vfYe@? z4~b?W-~zx!fC~W^11Th|6;0 zZAKpbxcAjamn zv$)~}y}uG;qNJpvU{-PH5ulUKl2Rvpu8|%uZmuYU*fmNZFSy)h(!Jg^A$cU%dnhhn z3NZwq=OlSKZT<|LzuQvtXB2ru$CkTJu7vRoxZhQPD*FAMU~Bw8Qt=lNTMc?8Gt48FIk6L3wb0 z{e&3BbM}onOG34CzWdGEImKc0;h60tg|p_6F=sL5+mHuz9$;a!S>Q)6{55^pq?O&Iu@taGvVAMTsJ|A6%U$9aHzz&NAtKOlYo0qOe>NZ)@z`u+pb z_aBhH|A6%U2NdJbxdfWdB_N$kKsuLzbS?quTmsU$1f+8bc-1(9qJqUyCygUW^~BZL z8KN--i*(XeWtvX{R#ls(0a#r_ano2qya{qZok`*ifa5gI&P?HW$DF51XwEuy3i2q# z1LPtD&Q1Gi$RB4$N$KwC&by1up}CL6lXiSxlgYb_Ag=AjqqvUVJ*#+)Bk#r0CyHOF z5c01|@~=|)D@08pLj0MC(`nKA}!A+((-&FEzc*?@_ZsK&nMFId?GE+ zC(`nKBAsbIt3d|u3!P6qm**2{c|MVr=M!mpK9QE^6KQ!qkSefq6i~X%%dng{DQ$QeR*+&!uk|vlG@rcu;aWnv5)1=n z;PHjWfa$nRdr`BF@owbb65VAzg&fhAl|iZ$f-F6VqH=FEC(zb(PA zqZY?sd6A7T#Uk>%S987;lV&5e?@Tb9(H^QD+IH&FXM35I(car~-Z zmM2%bXw14+6~`;h$_Y8&qE2@m=qPztkL%2J6Ugtr$G%+TcccUN z+Lt5t-jBHQCo5Mu;@>DtkJXSLVij{M&s!L4`+~2RCy^)RFXE?jO}q>ps2A?%dP=ND z1~^}eRnkN4onD3|7*nAK-(}+{vAXX7=S#6lg4Dj$%WwhOhYxW*B~~NfQmrC25X)ZXwitU&wlx451XtC2tEd?{ARermtg%disTC-mem zXT~b7FFg06&b~hZ9pp(lSj4J%k>?_Y0n`tFpX)2J8u?4kS7J5v^ry_f607^3<9sDn z!$07BB~~LZbG{O*p&#jEb>Exmu^Rp%~xWT=2hII 
zNU=)uD)OXQM5~G^D6SCSmiaq z9$o`v&Z|ldK=bOAYk*LK)c|3Pe_F4iz7nf6uOeTGRhn0kuf!_NtH@VkmF89CE3r!R zD)NQ&TJVwL7qXS7MdsRpcwNO7kl6l~|>D75Pf6(!7d%C01!(MZOZNx#U#`#OnWi zHh#rkfNsyP>kVMM>Usg{o*(sey#b6jT`xel=hyWH&|cRIQ1|?(r|S)1yyy6Y4kb3~w6UfvH*pruD2dKRO z-JW098?bUKu6I&yRrma;r|S)14C#6Sx;?+HH-PrKUVysiM?GC{0Aonk3()QPb-e+! z*YyI_JwNK{dIK4AE3TzHw~AQR?fG>*0xP$o4<%M_q+Wm=tJo93Sk?6cUbd&ls@ewVguU)LM3@+z)(a;z$Qek-q{o>UiL4AH!be5nrLedV-0zpgic_B5}ey;KMAKD4#x zM?GC{0Aq;eRkWAt0N%Gw+w<#s187h4D%wkR0Pk~Kdw$f@^#(HLRa{HeSk>(Lbv*(r zuc8knR&S(UfE=q=%ByxIR@GjBZqKjl4On><_qdwwggqMo$h!Wg1?75P%E>h}D)-T>Ouyo&Zxtg3r{ z)YJ6_GUnCHH9*>)U)LkB@~Wl=xRH7Ra;#n{uf7D&&&c1is=WZ+o}cDbTto~yv;D+md{zm||nFT-^vLwADVdNPm7Jzj=YCd29k!y2-R z%4@w0>r96A35E@%hRPeg44X`b8xjm}Bkfea(aUg?$*?)Wa5K4w%DrBOEhfX(1j9DG zk1}{RqyXj>pV?jDWY0n)-*7qOpC#eBT)58_FlRUA!LxN{?Z=MFQusL6UT|!N`9OM| zLc0}uy+V5wx=Nv|6}m>DYZbaqq3ad8L7^KJx=EooDD-U#y-}ezDRi?!Z&qlpLboV% zt3tO)G`tu1eqDwh+Jf1hcbnlY_{=d*KNHWuXF_})#--20G0+j82YkL6o>MiRDef!< zT$2x<2Ydz^o*OTvpACE-RrzdeB-_qc-fM>xn!HfCxWEpn{DO1grv&-yzQOqGfDbK2 z%5NwCb|Js$aVmG;&*W~!ZoQB3*GqO=Bs-6v$vui)RS)A=Np@Q$yXqj5S1WckeT-it z+1)PL)%G)atzuWVpYiJ?yKRzP{Q#5KD|QWojNc&H-67dEKFH*aie1w%<2Olm+ahsTL8kO+qHsB|=s& zWKxYHWb1`Ys#k<;t&mBzi;%4pGO3OcvZX>M)igpjU&y5TM#xHpOsaK+EMLf^x(8)g zhhps$hrXf6FYnm@Z)%^b<&-aLqU41yYohd8_{*AzeBm__t~EY>;N(X%Ja(H^ShvA%BhI1PkmTqJ?&+9PX0Nd-mR+G{g{Ws}2?)B_r;JCB5I!qk* zdiFDL+&SCWxYx65*q z`e-6z8UBG6KIVt7!<>O%W8f>yfc*Ayau81&xg2_X^tD$w;Ix2_{s{aE`33`io80K= zXg8`oZurkwyFNS^=?g}EL;hF@f5gW(Jm`ys;75Lm7{x{Nb=~aM(8#9qjE72Zz@WM|;+X!y^OhfU_^y6QhLT z2?YE#{y=YCeM5bH(Bt>_cmkf9s)m}LV4$XZGdDEw7iziVB(J|hdsSw+8*>wlu~+c@8&_0r*AU&3Du|bALjbM!M>UFnxN-}WLnXGg7bg(HAxvx|D5y7_NVjD zaQ=G_r1O8o`HlCb^WWk8@9atE|0Cy{3jZSKS47hFKj!>P_owrJ&G{cG{Quzm+Maa% z%bY*dmCi4s0aQr#D*OeUU#|FppNWB2A#wDj`?;0#FZHJL>o}h%^XcXM-3tH5$hSL* z`2Q#fZ##gm`yHtxz}Nkb6mdreV?p8$9E!kiVG(y<?*Vo&936y@cpv~DaN_P69Kat9Blx>oF@HB;2&~x|LlX9P2g5#(r*5lt z+SqEHwtB47_D1WpwI+4iWSzFPiPNeUOSfvPrBGc}B@}9^TEuBXjWt?Ljb+wavxO^E 
zd1~v0ZhK9gIBoL?1y5^>#cgd7*IQNPu?BA2YF$~|)&}8WOKrP2ZEvy^+M6T=>vU_a zb?IAcgf3i}HJYccE}LKC%EktNz022r@2-}ecd(=-LMMfzHlAHv7C&w*2lR(|5q{z-`22~SO*1GaK4ZWd(MX6VPR{m zkc)LwprP1{>lf$(c^(|XUaZps!`xl~-h=$Q1qkE`dwM~rp!_}@vhd}le^Y(V;Eg1HMH55&xf+A;z}q5HWwHoR8mMB2U<#8fAi0e7)#W z##2bL6V#EXERRIkzslTMe}%n3T*j=qv$P25^fAK2FKjlD1{~u0MR|mmDXtw(t(V!& ZPR0ETff!fNAvn?5U*h(0#Xup+{{pcU_OJi| literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv index 9f2183e46a..bbce04c538 100644 --- a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv +++ b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16.csv @@ -1,13 +1,25 @@ -knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias -_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0 -_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0 -_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1 -_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1 -_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1 -_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1 
-_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1 +knl_name,co_name,tn,tileM,tileN,pf,bPreshuffle,splitK,subK,bias,clean +_ZN5aiter36bf16gemm_bf16_tn_256x256_bpreshuffleE,bf16gemm_bf16_tn_256x256_bpreshuffle.co,1,256,256,0,1,0,64,0,0 +_ZN5aiter24bf16gemm_bf16_tn_256x256E,bf16gemm_bf16_tn_256x256.co,1,256,256,0,0,0,64,0,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_128x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk.co,1,128,64,0,1,1,64,1,0 +_ZN5aiter43bf16gemm_fp32bf16_tn_160x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk.co,1,160,64,0,1,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_32x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk.co,1,32,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_32x64_pf3_splitkE,bf16gemm_fp32bf16_tn_32x64_pf3_splitk.co,1,32,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_48x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk.co,1,48,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_48x64_pf3_splitkE,bf16gemm_fp32bf16_tn_48x64_pf3_splitk.co,1,48,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_64x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk.co,1,64,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_64x64_pf3_splitkE,bf16gemm_fp32bf16_tn_64x64_pf3_splitk.co,1,64,64,3,0,1,64,1,0 +_ZN5aiter42bf16gemm_fp32bf16_tn_96x64_bshuffle_splitkE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk.co,1,96,64,0,1,1,64,1,0 +_ZN5aiter37bf16gemm_fp32bf16_tn_96x64_pf3_splitkE,bf16gemm_fp32bf16_tn_96x64_pf3_splitk.co,1,96,64,3,0,1,64,1,0 +_ZN5aiter49bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co,1,128,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_bshuffle_splitk_clean.co,1,32,64,0,1,1,64,1,1 
+_ZN5aiter48bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_bshuffle_splitk_clean.co,1,48,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co,1,64,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_bshuffle_splitk_clean.co,1,80,64,0,1,1,64,1,1 +_ZN5aiter48bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co,1,96,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,bf16gemm_fp32bf16_tn_96x64_splitk_clean.co,1,96,64,0,0,1,64,1,1 +_ZN5aiter49bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_cleanE,bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co,1,160,64,0,1,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,bf16gemm_fp32bf16_tn_32x64_splitk_clean.co,1,32,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_48x64_splitk_cleanE,bf16gemm_fp32bf16_tn_48x64_splitk_clean.co,1,48,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,bf16gemm_fp32bf16_tn_64x64_splitk_clean.co,1,64,64,0,0,1,64,1,1 +_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,bf16gemm_fp32bf16_tn_80x64_splitk_clean.co,1,80,64,0,0,1,64,1,1 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_128x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..7594ad577d64979760ef94eaa7e8f8fb550f09b8 GIT binary patch literal 28656 zcmeHQeQ;dWb-!Az<+Uu^GWbKbW$)Su<4}9OvLs6ef2?KM2!A1n0oypNSJKKWkR`EN z3!6If(;^9>D6&u#g{EmB1PJ*cX;NrNlS&&VZITY|*d(386gp0(nMr4omeQn?G%efn zyC3VjS62qKiTV%AJbv$-bI*Bq?>l?mdB1%tpWD5Ek1Ho9=w{-T%PzAVxeo}+{q`2I zbK5NFs+foVKc5w_JW|6qD9n>EDT68W$Z=4sV@g#9ILm~nkQE9Uel!KfmJ6BPn2A5$ zCuD`XALja6MA%>J&BS&gvUT}#M;^~b;(PH^{_6eR@w0d%-rsvYo?7$U$q&bSzb_n% zuO}vqmx1mjLY+n-ve|_HzL+r|;m!j!x76LqSmLYUla0}iu>bx@tToco@z~%q{&4I> 
z#~W;a-&g(NhWM#)ODNpX5b5anQY;c~3_TcYjz`GS*cc;SsG~jH5P9}QOIv-otdk;UovJ;R-9hCa)| z-gSoSEKa}g8CI(q1}q1A*BMq>oTi+i%#vrqE$z{;I&4rxaI2I+a=lvwJg^smj* zfB!W6pRSL@(+f~HjR5)r9Lm!FaF+hZrs4lwM?BWt7^w-UEI2-mAo?OSX6YZz(w}N3 zO62>>VzmgRY7yF}5ky~vc$WUBv-CeZ4S%@^yV9BPiZhKU`YOEAlWlce>fI zE1eCm3{N9U#zKr{8{qM51AJy00c1w(ypj=5P9sXjLOh*qfakIe@RexMHb=>v_;C)H*1mCMtJkg*8=l5K#WXB*%b(+KeNL46V4nnr|-Mfg>=0e+Kh zfd83BfT#Cqi*O-_O(TT93Kw$RSx4|>9idj=fui14N6j!@gFc&NT~^OfcBO(Rg|a+G8pVO`b{*1w+!(_>Yhu5J!@Tv@k(IO6*w ze=g)yS=`?3w*EOHqeAr=M-RTA_iJIHF67i?9icYs2>ahp1oN!DN4oC$_INDR5jh!d zkG91kSI%FZIQIKJe}^q@Q=Y$X`A@{!PPNnFPn?Xj#zRfbrz4Fa+yebQi0QxNq$IJC z=>g%E<`b>%7bofAav|sVOa7DL)1ju8a6I&2TkL)roZG0tgPeSQTj=P)iZD@_vGOhT zO{JSpL{6RzHMN&*f)I+ghDtY8o!(p?s_%#rL)j7`f)nL2)X)+Mx9*+_nIWUq-``AEHLqF-2)f%v@ylDj|P3P*0%y*EK-y7I` zux96S3s?`mdG$SkZQM-i-0a2ioK#Ls+lb7@XU3tj>FgH&uhdVC`ks(m72xz`qcFtY1||DriI3F zq;VW+97h_*aruPT@%DMoFZFI8*}i1EvpunWsKUvcaIv0=A1}yBZl|}KVxJ#(<>t*! zxvuFVDV1_90-j5`mUN}gF}7-%cf?EYX>)h}#2WG|VAoz^-W>YYt@qV}ZH)CWk8jxH zDR3_SmaxvF|7!C{h~qz*%{&rf9=-IQSJ9l@ zvekm6&i`k9Rq)Pt*1p z+CJNb^<79l>|)pDg8gm>I8Wkyi3=naN?a)M8j05u7Sh_Vi=8q?r;O2gS;XkPEMjzC z7BM<6ix{1kMU2kNh*8YXc^~ENSH^KZ@qQB?Psw7u-z|8*p0!+_b7Qo>n9dO^zqFVN z%TmH6ghhl)36~KT5iTcOM!14-Ibkv33c{6y#e}N}R}y*&R}uOMy@aa?eS~WWR})@G zxQ4KV@H)bDz7mT!Zcjef=RiktF5&E1!7iYWKD!k@@5l=8OWoz&Pjpv!*+i9ZMXwK^ z#R{)DUwAJxXXFh(UxfU85%Tjz$j=ucKVO9Wd=c{VMaa(=AwOS){CpAe^F_$d7a>1i zg#3IFs(E;ay<+`H$JdW=TX){XPWn7nIBa4EVS=BNXUB1l`#moe@H+C+OH|{iK2crp z(Z1vQ4tldV_iO~Cywhdy|1IR9i#1D z+K$s!Pg^5xO|(U6yN|XO+FEIAr!7WXoVHHdPSf@fZD(ogqRpW#L0b=PeYEw{HbC1T zZ9}vT(>6leC~ae1j{mH~3U9x>nB~xCh4yo(D3QEG$!Ow8UaDj?b|e=mnLam=mnj(! 
zZ;-rP$#~d-C*WPObw>uUsAUn9u+8bQ|A2(rFLkgv8z{c?>4jWr68 zADt(jN7&cb2(rFLko7f!tgjJdeT^XNYXn(eBgpz1LDts@vc5);^)-U5uMuQ@jUel5 z1bK=zTEU*8>-y)$)%A0k_nY1E^Me!A^-fe2c^~o09&ypUoxDe6?45tQo>DR>(9K+2 z_5=kMFgKSAg8~<^JT4an1r{?8mwiEjB`lxI>w*H;vjQ#$f&$Cgd@ff71#T@$xF@!_ zdnRH*F~{}nCqHs>zEP~#1?Up9zENz}2Pep51(O16Fs^5g9H*Uc=={N6YVKc~ z6u3WtV=sJJ97oFs=o5>jUds!ohWpng1s=k^v2q-L z^{e9eT0TLaxL)eDymD%}|M8^2MvUwEkQ}Gw8}xk}q+ZKAXD|1UCIz-s;rOLL701`| z5&A@h)N6U^?C1XNNr7>U>)9g5Y559$-!`e&^42-X{ZA(ap52P$?|i#Bwc5)+zuqJp}4ERxq6LzVtjUH zKKFN1KBKP5`h0}<3OcP$v2MN|EGW>8af6S_aYjB5eO&6bd}a@Ge@{?g0rIZ@ zv6sbhjC>yWq||Hq%zC+hVNl>A_z(Vx9B1V7(4R@Yme1@F?q3uXSd6^u|J;C1sn_zEo#*~lL4jM5fBoNm+n&#Kt%rT!Z)85xHDC1u&=3Al z_?h`UbRjdJ`!9+79ry=3Ce2=ug{~X&lqRq^BC$g^m;yvYf!Dc7V-K# zjQWgmMm~?AK0~kPv$$r}$!iy{&!ec%7-!`380s_hdOnM5++lgG)gd>%o4 zhF;HSaZNlfuZ_GukD@+foRQCCsL#;r`7ExXQF$%p^?4Ze8RLw69zlJEUe9N7&25+0 zUS6L^QJ*o+$mcQCXXy2O7T4(0@>`aCA<^RTSXBeFh^%KAJe>+`Uz&m*!vkIMQyrq$=W@-pkQgZhkn4=tY)sL#-8 z`7HYlCi@LupL6KGbLEwS1QShDY`rygoaq&lqRqa{~1ldM%%2zfmar4PKvn zP@gf*$mc%PXXv$jmi^FFQc2J)&&dBEk>NE6OKFfZ?C;JUvpL6KGbLE zwS1QS#yZ(=tVexyP@gf*$may=GxS7dr+S-&dBFJ)Mx0me3t!2mFzco zeRig*&#K>0_1Www+#~CApRCW0tj`HqpL=9|?vwS|k@Yzt>vNB+ z&wW~beri@`KKG+OoBamrv%TL4%6@~_=RwqGj5G3i2=y6yEuUq-Q7iimUZ49>pE1tJ z=K<7b=(T*7{YIVaH+X#>M196MBcF#*pP|?CS@s);Wxv7eb3f`c#u@oMfcgx*md~=^ zI4=7QUY`e1pE1tJ=ONT*=(T*7{YF&w8!f!wKz+tIBcBIQpP|?CS@s+4vftqKc@Xs( z+_(j&qK04_sjY` zAnWs>tj|NTKKIM|JRs}ypsdeBT7CXE<9=`$^%>VDtv-*SK0~L~XY*bb{qrd5GsYSD zJcjxVy`Ima-$>v4qW>L6ea1K=pGQ!iq1W?S^c(4WW%!SxK4YAb&ts_1(Chgu`i=Cx zHTvIS)Mtz{@_7XH8G1dRMZb~07l;2S>NCa}`8*8x8p$Dz0l*uizo6vxqMR<2Xz%1LJI*2lC9uaoD&F8|MLk z8^>XC85n2dJdkHLj>E=f*f@9*imto^Pkf%0|L-OirE(7CioCosE#&Jj_Br$05H*$lx+;oCo}E9EbegAkm}4%u&Ha2W#@&I5UB z<2Xz%1LJI*2lC9uaoD&F8|MLk8^>XC85n2dJdkHLj>E=f*fWjCxCG_&I5U7<2cOc3NX&b zc|dRDILzk|FwVw#AkSW`YyiGn?G^J)l1Kr<#(xPpz%EHZ)rT9w@Z2U z7S_->JXeY56?^b(;=a96JVRMX+@5#u-e_X)-jW0#TgS)ZnM?QWbM>(^7jg@kxWd<|haVHsf+VGZF{!aE6rgiVB)z=MRf^!z+M(~DA`j4LM-QrG~u8kjo6Y+>k2_d9xu`8gi8(Z!zSX4Ebh5-fGC(40*dD?=a+` 
zA@4NgYD3;-$Tfz%+mQDd@-2p3Ysj}6@?JyUr^%EH$d^v~ZgtVTz{}ey1v>H){4@R( zea6-2JE48P`)Q2&eB-%udfwRgRDN9{;o4dB`Np&9^xS_T|E%M=V&k*EmDOHcf5b)9 zcCT=(EI{n zVGkI7r42$~s`(w%{5C~|eUsr=c0%aOG{4(4zw%~bFE{)u?iczB&96@L+k8^kHyeJH zZ9-qE`Q5JhRXrf=RfgY|4x!(o`Q4%U-E>OWZ!-LDeo*Ld*8D!K`E5NT>{|`LZGSBE z+cdvJn&0-13j21$Z^t>I-=X>4srd!Fg*|Ba?R;41cWQojX@1qc!d`9o?RrG$cWHii zYkoD43VV&=xBFv4zgzP=toiNvxUlar{BAif^tWh!M>N0M$ArDs@VoUBLVv5~cU1G+ z`$=KnYxwQ^l+f?f{O*x{jD7mG>-STs_YfYW5A#Ao`k|JyjPMSU*AN~e`9{LKNG>Bh zO!CcyM<-ot_seVT0ms9yO$U<#Z+XXUTcpyH*@>Zq`-UL^R=#DUF&M!bnn+{pF{EiLq2H8w;6JsA>VGucNp@A4f&8E z-)YEq8S>qReAtkW81hj=zDJW~t^XZ;ucd1JZ`}ATL1vE^qI>q=XOCBz@Sd-H*wxGqVo3d$rv9fJeHmyfiwvEcB^~=h(PT92H zS=m-Ao7P7w+j3>odTM36R@t=vTG3%`QEVjjz_Nihnu*IZ%4i$5@EvB?j z74yI8dIMX=deVK1iupF#C#|QnPZjfLwwQG9qhh{kiz)3>#Y7J|X`Xcdq+#Y7J|DJI3)Z{o`M$H4=LH1LFBiC=N?_qufeV8Iuc;AuZLQmRi~iQYO!tPYFwXy03XVII zye*D9@1UE$qpQ6*ll(1?JHJjGcP4pU9CyJT;5uWn+D~?b8p5rij#KUJZLv6wXt+OoqPg`1*(0IGNd2i3q2|`6wop@B zEEH~O3AJ^E+GA}^%`K6RjUBOujV&#mCpVDhzDPry3%)>OW4J8b*i^B(a`Wa$ARKN8 zGzQ8_E6W-pjb)V;TOy_O4~#U_H+)FVdNShYDvW;garTcgZFm$N`f_}Ct-Y4={n_lI zpx?*{^D&=Me21>Rj`022?8#D1NxO|w@7t8A z=O>xsJXw*ce@KomxFu8HEA{I(XX=Ng{;j(+^?xe$$p-c;j{XX<}0^)DFu|CIW(Co;#sBlWK}Wa{Ve1e(ph zXXqD6eWMYF{+t`VX0u~0nQ?BE`dUL@A@$E0>lu{#f>>spzk}Z8Vd~#IQr3N>xBXqm zM@et{yN=A?+7^#6f8&`}`b(9}f1>r2KN{|cGWmb#<1yxswKaz0VdjrVPRIG^@X6)| zQb$^1ZS)auY@`o3^Eb4e#2@!$_^X%ka6Ms^yv3eC))KCdw1fhIiaqJws+#m}b$NPs z_m$o1%Cxrn%GjFpfYO~6YC!4kDz&>QaAo(3wz9lT4JZrjRJ+xcyX07Ibn*6gWP zySu8=+Uh;&=~VC8q}(>`Nyn|JR@y+#&h&iuTzSX4D>tjSyDLl8?w*P*Dp+8PIz(w; ziwag+x?9b+bW@-#yT6B-oj3lPW~k=K!JP+g6@^Tt-?S0wJexNBT#&!ijzsnETPd@; zZYdeQ_(`2|Q`bj@wQ?YSHi%<+q^vG`_&MBvrBd`Qy?CkXqrwuYS9awu)%17O7{B%^ zySkn#^cntY{tCTh&!tWItLwKyNBS$kd(dAukN}}N}~|d{|~13#Lh+n_~Yl2 z{B@(maG4lKOYj+J!pZpQ!aKlzKe30-f@&?GuLDCV?;E z_mgCVm216z$vnE(I) literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_160x64_bshuffle_splitk_clean.co new file mode 100755 
index 0000000000000000000000000000000000000000..b7a2b37260c89983c3214af7da0bdac6d9158e97 GIT binary patch literal 32720 zcmeHQ4|G)3nZKC~VF&>Nq6AEs0RsX;oR9>90uB%`YQTtysL^4POfrFx#3V%Q()fsg zXsseutJc;SV~jDzG)+^x?UJddd$!%9d(?JM@oeo$d)htSv+ZuzZufM%U1z`F{WILT zxdF{sj%Sa}K`Zz4Y&@8f=OEIyv- zFg^ylFA?fE3Xx4E{P&9)^AT>^UUF^uEsS*xgbq|kT0(jI!_5uh`j*EYelai9T-)+f zR(5(W*1-PU2JTqzxSLy+@cPaX*t+=dzfW$`lNffN%hcYIoNr7 zxXj}8N%wHG>Y?9qu=Dn?*y1!{59L+zVyM0;5>kf^hzM?VN#No-Uj+R+D@oG7G)e!m zN%(hHhFjwUP(FzO`T*=o(tl5q{uPt(?`>&quB#50_|;XgcM?JLL8wmBKa!+B-7|5E z{8(498iaf`2u+g+q7OoAlKzL1^glcae>n(S;#b0(&LpDfqwwa@WCNT?HUQP$$zBax z;#b3)CnpglVIUq)Ho#NK26%Q70pu02>FgEJIf*C<1JRvqfZk*S^iLvy8i=!3#NZ^N zBn-rGvH@OCHo$i$5kL;a=CfBsbP`b#2IBk42KYg;0p6TM05uS2uZXuN5hYNLoqk$2w6!- z$eu)mzPgrRgbH@`UI$0L8< zN-4Iuoo`v+OJr2AGU4cfkMr|dSg5yBN|KIHnskJ+j}yT>YafxWd$p;xIoJ|D5Ne7v zHiyrizjATxk9+?1SllK&f8WllZEif+M2D|E5N>D<*3=yeR|l~K`UeoxBb<~ZHZtBI zR9{!y;Qr<~JzU;O+539lfzY8~O?{{}cu!;Vei@wG=mrmR^1Thg+jkU&h{9|xTvu6> zzos^P;6SjZsbDpPU~5A#e~tgpn!;dZON1E8`Y;ikxIG4|>cgRit>4LOYOZSwwUT;V zm)US$bNFr|NE43yC8{kcDX-+U)HXE-s~QhBv^Kqz*K+TH%EtQl0!j5HZ+|s$U-7VS zx!7xY_k`NQHO-AQlo*kK;&$7@%`J6}4W|_GEcsv)V4jmoEV1I*WrJ%7XA?6XkUV`r z>d{_ppy&&QJ^P9vP2spCwuB?KdjL;b zs2xXY$C27`q;?z^jCvjKaqpw^ycHF*PM2eVlmfT;C(UICGD~M7+W~sd&)~c)0!jcqq*dl!7h4-c~j`A+p%{uHZXRS zd3+~5o(%W=tjN5W_f#jv^e!ZytgOf-QNp8QKbj@*B)2Vyco$?xyx9vQ-i5x1*S9$0 zU7Q>7<}QtRmoAHVm#sJ(E6aNX@h*w_PWg`dPI-OaBOPpIu6O0i$a2w`b&NW$!FIo6XkrCgqt7desbJ(a}@-46D}oOMtC{la>8YVD+rep z#;@Nj%A0K8sd=QIN9Pm$%_2m9=M$p83kcEQO9|26Y(n%mhYgTzDEl)Jnc^5 zb4T3Qdgxx_jK(~k^bceA-RPd>X`e#ZgxBMu=jX@Ybx(IO_jFIhP5VBsqwA%P#w&g1 zz9XKDNa4&UKVLXAy&f^#GdyhS^qK64C!On4pd&ezaO#vmJJ3h>h$5f&)TQ3nj}#tx z?nsfBjTZYZJ?2Bs7kS0`q@RVKPeOh^3HkXXdx^osE%9Uou94M);OucP}xk;6u> zCG6nmtmibYffJrH8T@{B<_z6)={`yKB_HiOuAkDqnU*z&(y(IUM+)wgA`!c$}Ij+6*b63(_4ZB4W_)7DB`8*PVZ`y6eDX=|s=p{;|qqqH5T?F4NP(e^NHkI;6Kwo|k{PTP~T zJw@9yv^`7P*J$gct&6s9+Inc~rLB*)e%c1woxH;i%e-l99!sHIr2Q1SX_B0!WK2j& zp08v~R7qZ-WXem(mns=wMj<&{$@uaH$vH}%D&&Pq#>9{8UL{WxvQNqQ{t4L^DS5h( 
z7b`hK$d@U3hLCfWJX6R^lsrqwOO>1{kK;_6){(yIA;-B=8)NM2V+>gzW61g#L)OO_vOdO;^)ZI5 zk1=F@j3Mh|3|Svz$od#V*2fsKKE{ysF@~&>_7_vUbko7T!tdB8deT*UNV+>gzW61g#L)OO_vOdO;^)ZI5k1=F@ zj3Mh|3|Svz$od#Vo?whGWnDCHdj7PUf6VuO>&W3pA0DMu$Y@cP_W`e*v(KHeiO<;? z`|zKxpp*y*bTb#1Jpq9k%+2M@fWWydjmud9f!WN%WnVyGE=%X~vVg#qEQ8DbfWSgF zlgq^cf$Lc&mjeNTC2S6tO9KMSvO3(O%cmS2Z3~E>!OwpFqt28!vi18ObX`;4$TshP zPC1WN9u>F??YgJQcG~p>ea{T3*RCsPC->hI6b2|6*~R_$ zMg>-*UH5$1PP-nV@5zyR?YeY!bN@(GV0|%;zvvg@_}cXfeb*AH*RES<5BG113T#EY z?&Y$bc0EJivr6i<>)N@U`yYx5JiH#q&;LLiU%TF+?<$gd?Yeg=xW5w>coglr*U5HT zK0x2ILF%=(K8|+X*U5HTK0)8J zUFx;Gaw@t1Q&EA>mf-j|&B=)^^78wWcX9gYN@wLUUONUm9N4>VnUe!~)o@3=zCho7 zoA8TLuEB4Wv+5Z1J$KH@QMINN@k5JqfL&pM-E{(c4lK?ogZ=K+9J>w*?7mlE&*xX? zl*4{xGsmuD0=pj+*z?8BIlEwgY$wOACj@psEwJaSJ97eT(YF>Yc8|7AOU$?4=PBQK z9T4|t}e`!Eq8SKmUEoXxi()Hmq#d=oXhT-I)0--c1&(9X!W5!5&6 z^?VaGevhp6yuJ;hzM-9wZ$qeW(Chgo<^+4?+<@1&VbnLYGxBW&^$mJG-^3guBIgpk zz73+jp`DR$L#S`i>-i?;98GfW!Ry;F>Kob_`8I<32ECqdVvcf1&Q*AQ8$^9WJ0stQ zP~V`}^G(cY967h)^=%mS4egA48$o@8Ue7l%2Rb3=LcG2WqQ0S>QQwA8-=Np?P0X22 z%DEG-Z^Ni>XlLZx2Cq>#p~PPMD>m4?Z~SkRo`g-uIJnE1odsisBeR^ zz75IxHZ1Ggh^%jevc3(;`Zg@<+lZ`hgR;I2$@(@d>)VK|Z-cVF4axd8EbH5dR^Luf zP0Y7W)HiJ%jk?u^`ev-B<$RLK`6RDz-KcM9XXINC>KpW0zRCHdN6sgCed|PhLpvki zx=`Ps*YZuyCo|=IlGnFx)Hk#<@~sE;4SFr#qLD+J0st^P~V`}@=eYs zeR4j@>svSK8`>H9)`R*6y_RosKDkWJCwYDAM14a$Bj37E-=Np>P0lC%az4rHTQ}+( z+8O!QgZc)&mTz)CSuE$1yuNj!zM-9wZ(XQw&};c7=aT_BpXBwe8}$wCjC|`seS==h zH#wgymGenn-#RC%Z)!fN>YF*AoS?q-81=1F*0(NM-@0Xe>yh=XQ`WaGS>L*4ee03+ zty9*wE?M8YWqs?B^{rFZw=P-Vx@CRq(dyeT(h~En7xm4YPolor=ac1fKFRA_Kk6IW z8TmGV`Ubt0Z*o4lN6sgCed|SiLpvki`cU7X*YZuyC-=(vB(HD%sBdUz!wIiEZ!=aam?^`pL_osn+?sBh3~`6lO+Ps#ZtuW!8*)i*VtRQ1iAPfk$Z28{aF zE9+aItZ)6Yz75Fw)+_5sz0!Z~d~q4QTZ( z&B(Vw)Hgn-(B_k?p$-nCzM-9wZzHI0(Chgo=96*m2Dg)XXM)m>KpWWz8TyU@@)|H4egA48$x}9Ue7m!%Yy$f>Kob_`8I<3 z2ECqd2KR;a*C6T}+8Oyag!%@(o^J+M2LEByH?%YIZ3OiVdOhC^ZVl_NLDV<2GxBW+ z^$mJG-^6?}&c(rh81)V9jC>nGeS==lH!+`#b9b084o+0x)O=FaH*-EYL46xB>f4~K zZ$q-a4a@pABJ10rtZzfIz75OzHX`fWpsa60vc3(=`Zgl#+n}s(L$baN%lbB=)wdPI 
zPyXaj9p~gIZCs>{vqa9>I7*X?L^~U2iJY-ZzLzm20bxk$9L zahAxrb8(dVnoZ{_;b-G06*mbxxXMopN4ec?%qdah_`E{YI2#vf<1CR!HjdKdBGJyq zSt1W?9Hotmv~ia3w{esv7m0Q@&JuZG<0x%hq>ZzLzm20bxk$9LahAvf8%Js5B5j-{ z{B0bi$wi`_jk81^*f>fX7ir@x;cw$8O)e7cY@8+X;#?f1o^Lu=2|pW0splKGO61$8 zfup=TH8I~zj#55HEZ4b68)u0;vT>C1xnrD*L^~U2i9E1zl=3-c0vBoHEa7kCDCKj_ zI2Va_HqH`xVB;v|bI=4X(#BcB-^NkO=caKk676i9CGx<=QOf7630$O&vxL8mqm<8O z<6I=#**Ht&fsLb-&v6sDNE>Gfe;Y?BpZmtSNVKzYmdKlPag=(#>0BlJY#gPYZ{RAC zZ=VK^@(EXBzL^}Qjf=E#mdGO;M`?1AXlLUrkq0)8(#A#FI7|53I7*X?L^~U2i9E1z zlr}EX##zGO#!;GFB-+_HOXPu#qqK36HqH|MHjdKdBGJyqSt1W?9Hotmv~ia3w{esv z7m0Q@&JuZZE{;;qw=T7oBd!vD-Ig_+o^RkP`F!$Iz)^lXB{APjj#ADi6S&Ae3ulQu zvT>9q7m0Q@&JuZG<0x%hq>ZzLzm20bxk$9LahAw~UJFNQ<05UGCH!q1rO8F2osF|Z z9@sca8y9KgEa7kCC`~RB?QEPS^1#MX+PFv?X9<5BM`?1AXlLUrkvHe!DD`~n)3{3b z^;S4p^7-VafTMh$zIUvDr_1Ch<$N-MiyX3WmdGO;M`?bK4ee~4CGx<=QJUX- zLpvL13B8S@G`}Z@b~erud0^ux&F|HrosF}E-o{ay-@`*Y8)u0;uyK^;_x8}v##usd z<0#GV`JtVSvqT=)I7;(-foNyrETOk?l;-yc(ay$MB5%&cQR?|Nr0N@SmGB!rPkjSd z$>)=w0**3;@(mm%-mUoA)MKM~C!>?T(zDi^KAMtxjK1fd-mcym!}~(3s2<{dfS6}@ zW(l>!dm(t=;wav&xOH~~?|@{|vjgw$-I0#nyK_5u+X~(m?~J(btcYaceXWX$NJm9Q zE@R^Rn~a@)?+QN!oK1+ADHjtK6BZDzCoCZh5Z*$F1J@9i65c~tMtFj-obYMFUG)AT zy#s~swlY?KCXX?HKw!QjaCKB*0h`V3g#m#@j=(ihfos`fZZ8f9T;~Y9Dk|`5R>1A+ z0|GZV0yjnlUc*YbJrEGM$q~3YDsT(Ch1*L40=GH>w?zeB%WAm2G$8OgN8t5Qfj6*w zxV;$)$2L#^a2;3PJcr$yN+jj*7-r@+{9Tj*h#xaY&8=OYh3EnN6 z>k;qtLGSZ=M*k;9?~lS)UE#i9pH~KEm+!_8i zHqs^duTwktJ*$iES?Ye(q1~@eP#bl>!h3}D{+F*Sy&{uv=@hzO;ax*|-!qfnyYQZu zaqn8sO5gT}T=Xb0_KtsEnu{K3zT>(00|tG5mC)yto*rp({T}w)ccx5xgxgn#g?+W* zS5PbT1)AS&nqOg^uooJBMf-)mNb|d0^ILO3*w+|-Ya4}rt>$-!=2v{TuooMC>so|< zo#t1e`CWBT*sn7DuD(a;uh#tT)cn@pE9~nHzYTvS^cyt4yEMOzpBMIxhTk>!3H>#i zUr_T491-?_;kW62q2HwW?bZA?9~1V?hToP4gno1J z`=YRKGyJZ7ROqkO{Hip+(#M3o)bP9ROG1C0=2xxxUH^ozUvK!`@MWRDLGuf1eq~P! 
zdzs<4{VPJhUGuBa{C0d**moFyH$EryH)?*hnqT?z!d`Cp-SmRc-=z6PG{2oM3j0pO z@8*|;{$|auPV?LKvas(m{BC(g=x@>d_Gx~*UlsP3Jl7*(Ox+>3OOsdy)r7WO3sa{<=iOjT+Qt@QGvB`ZWNJo zqdKRI+xJBU?*GVhj}vO{QTo1nk5&sDl5aEQ+YR{+L#{C7I}Q0RLk=49UPBHUa-|_x z8FIBDhYh*LkZTP&V#sxdyw8yLYqG4JUizMps-4+sc-J*?4e-h*yarg?;eB-N{l4Q_ zzVy)}j_;V)x7NYCYM-qQcs~5USsN@`>mI#?t{roYkb?J%$TmyawDp3rd6Z3CJ1AR< zvT5rGW&0DYTln#`HHEVMUfHzug|hud*|fEWvi(Zgv~`EF{Y=@kHHfnPMA@|Uh_bz{ zY}(pH*?y>O+B!wq-cUAe&7y4IQ8sP;qHM#;rmba^?Nw#d)-}r3qiot5N7hhw)RoB$COQ52PxY_%BHP}laZY}#5(*)}Mfwk}gP{Eir&584_{ z*{)PJZM~*!%al!9yD8fuWz*Ji%C;T2Y%!&MqL}M!F=_3sV&>ao zO8Z1Hb8Rtc{jOqW+hR)lL@}{Ofi2-$A`U>soN9|H?Gwes8f9Ee;t^EL56C|5`jqyG zVq%RlE+%miDkh$@kBceo6UD?DWn4_+GgQnU*kVfiL@}{O85ff{5f$?rwwTgBQB15+ z#>FIFMaBG@EvB?j6ccNdaWRRTQ86F1#gz7mVq%RlE++9iDrT!KrnFBK6Kj-lF^MBm zF@v_4(mqj4tWn0rB%Vpdyxta5+9!&MHOjb{#8s)7MYfpIK2c1pQO3n2zDvdQ*GfEf;w4F1NFiVqfUB2+M5czxxx%y^znTaNHTUyB!?&LO!p; zac5SD<6g+;RygjgyToxXWH*>Ez?uC4gh2zey6vw@g&$DpcIo0C07xK9lj(bs! zIPQgfzJ=q?jfmr3(C1uq+>7_oy^DV*=m>qUgz?vVfu1b<&-iP#Kq>OqgdsmJ6?h#N zABX?X*dIQOHDb41+TBv$sscz^L0Me`{WZ|5eDpU$ePv}OzSaIb`sZ7AWqo5qZMZqu z6l#s&m+1#v8iTD7`epmgO$S+y)!w;8>3yj`vp=rlJHM9t?;HBxOZ}f)65D?$^@qd&#*NeX1vHga8~V9Y z|4Snd{knGg(DiU{JOtlaA@yereUa3ka1z@Gr2fx_{_ml8d6@cp#>(pk(%b%S!-J%^ z{oMwZ*U;D+W_i{3Hqc+9V0pC-2lFDKmI#yoL*LrW@|qi~L#-i}*BU<5%3FsH)K!r> zT;JSCDPCPo3C{AW8V}%)cQE`_i`GyjVT8QJo zlhm{L(TX;2OYZMoBbF8i`YXWCpnonW0XpR$K!@lPMAmCxu~Io4_;p{Uh4PI6T`< zHt76X8|*6o*9^V#SGb$(YV7o;XXNqyvSTyI=%Y>bUtK?UkxuoUcHTyJ$;H^_>%$i`A4B{HT& zj-(yC^W+n2DMlK^o?4{C79eioI9b~?9gwWqT8Qj0VEMz|?5uw{SOTa+2V_7Cr2BFH#l@ZEeeo`^k_PZyKWlSt%&7Axf9)5&YIne0?N^GaYo zK9~H?8~Zl?*&fBGpCZVlj-@y>l$lp>eS3kGQXsWyqt?CxR7Wf z^xD+f7*)YF70>39FQqdK=ImW(IKc;VnGAN%8M;k`KIs{b@fpe{gWYq6 zeI`Pm^bE)O41*?v-E)Q`CPJIekY&j$@k}li=fg%d4FXRIOs;!r(C68tMfsx^<&V`N ze|0KZtUBOC9SZagc-*4=mn_OZQHT6%g5_w-) z%$+d6osg?TiQWlCi}GhI%Ac!4o;hKnnhDpHI<)A$aQ%W+1sAO z&{E@wZ&_9FM^+WQP=^AR5o7BaaitC|HIBGyRl#dk6@0%A1>6zq8S%q9wA47_O{)st zwyNMKbtqtt7+=qbN*!8i9P!sy6})3r!Sy;6a7V0X#LYUi)Hvd2Ru$Z~s^I-P6kL5Y 
zk)Gq1&FgD*XsB_*FRUu~(5ix8)}i3)alI2hszXDK6aLMrg8#It;D72+aP@OKC)}`+ zI#lSraKk28)*xEe;H*Q#wL&hFE}rCz;)cythmu-XbXwNnwyeQZhlcN_3$g5}Vr*(4 zvhEJK4n?)@@LJZ;XIX>)el*m#t8#Tb9WSi=EvSunf15uyY$GOO_j;^<4K|}xOfnI}N1(sOjp;w9o06~_S!IZ4(zElzTN~&oa>Mr3w*zzW zGqIUWycl~bn?K2v(>^HhxHbjCT|Av$=`$h(VBt336~a|?R$a3 zY%U+0&Yqet=3WgH&dyC`Gw&U?GL^u|1nhiOwcfCkHv&(^PbX*cS#Tu!Bg%JnPbc$* zbasA)Z#%Qy;Eezowl=^nG2drrETaHBU}rqa_UWUHj{N+jTl6I$hytYz%K!Yv-#zhb zZv&9<AAa-{2${zM$HqzUA$Rhw#C zR#kY1(`2u#!QcwOj{sHx(!e#_D^~!I0;~X304e}UqrDOZ5CwPwaa&VGMrd!U1Q9kg zY3>l3Du|Lcd&LFrQUJ>6wie(67;OhVh&!4pCjdS}?D8{qK~@F9r-H9kyIq5XP`FH* zLvFMQx$5QEjB~A@v~&f$wR2(*k@H0w0CIM%vH(DtM-PbX4SrR0WejycOpEm+IK9d=LzZFDXmCgQ)JOm z?gBX{*>;OaHhAl{^se&|As2`!Ulv8Du+yD-;0o||!|8UXc2odf(ApKZ1~1dTE{)GE zEh6uZio7CUkXIyGS}YNdSMqpL5-FLEb8)kAN}B#F_Zp|9jk5%E?uIc1ut69jr2T*a6kmYT{9WU1=2|58h3#U5Qy?Bj4w!g*u?{nH3Cc2c_GP26zL&=Emp(LK*TdEK4i9q_-DCi!UyQB~l)dcP+`7OJPa#C)h&$ z36S~|AoV9e&L>@#H2Z)??E`pdv8ntp*Sx+KCD;CJS+%22RMpo4F|s=rus-@)9X_Gqv4zWy6JFlGN7uiH z_J0xjANTe(1Q#A|7+l!f5L)<5Lu8?^VFd64fFA^WbU}pNRO~?*cM#4HoDn!j;5-25 zK{!Vjl)yQKxW2H~PHd11(1tug+{JOcfCApZalEJkz7=r4ASg;>W2wo%3_6}z>y%3r zZugdyeo<7EmX?yE{~1NuryB#?`Y{mKkAe8z#^_^X_zhzObz@*#KL+CZF%aKmj9ug< zShL?*#kIFn`qAP!h!b2b%3-(kw8Uy_XWJO9t%Th9hi%}Fs0IaMr??o^pwnFv%4yq$ z@@b;!aFd_^y=T*>y*ol1J7RINu1?j#vM%US-Y^Cz!b(_ACA+8n<9c!T#Ruc@N^-L#1kLk?!58$yLBslv||wCYsg8KJ1&+xG`=B> zuOU}aK7#Qz?Zx;S zY6;6{!t$BM*NgE*K0|zc7+<98;>+?`Wcf_vD{mHGp3gkK#(dr+zCM%q{-a_2_F;UD z`Hb;3&u3Ru%V!#2KgJjN4E6>vzDU>E%ktSB)$*Cfw{Nrf@_gp;HRkgs@eP>7_nN`p zAja32&lq3xd?vgWVjhGrzGnH1@iouqYVE{)4Q>`+p3gkK#(dr+z7do7J_+`I`%dYu zHQKBeo7HTz*{ntzYccYf)oir+ZZ%rJX7#li<;-d{ug%zxwVK!HenX6nHQKBeo7HTz z!>mRdYccYf)oh&C-D6v)oirGtVSDaG4h$!Y@GMqYP8HPWDV=>#ahknJ!`PnSfg1!*VN*@CN&%FFssqVT8w;VH5=!Bw;HXtcdxEi zqnzKQM(gdxTFvb}WU$v*qgg)J)Z#&tnvHgt)o7N_)mn^v5tEvY^S)b+*4sPC;|sMK zquMO2YB&0Oci~uNw{Cd z=V5%#ynxS?dweN;_I1Itf#mb0NrM0x#`7oU}det*i1`x<_Ks^s^33A=YEWc9sm zZcx|(5WfG#eFJ;(6U-a; zBRWNG-v;TjB$oeMg*^utryJ(9r2(E7mCy%*+Xau~px 
z?>69)4u`qme7C@TU}szKuSQ`Ct@aZ2`Agt)?(dS$-xq<0`y2N@;JICX$>Dba^fiLN zaeoB%eq7Y|_4l9)sNO-_Yp0b#lErx$Z++f44zSI;_zpo!o;= zj*vgTy-kKtiU4~zx>zcJ_ZaYA1OA{6XR*Ho z_YXYw&)RU0p>_@a{wKT!hfC6>;jhcfZrM>@ROBT|9#-%f{r}q9>=_nd-wW2OVU61G z*%xc1w1@pnS}vEF{tYyVy`|NojmYk}pOfO+{C=a?yM553s5I` zKKvH+E71Q_lYW%rbbd`jop_J3K_}E2t}|?=ll5<=6W7QFolvj1PT5Q+>)%W#u8|Eo zq2_U&t!6q|znMHRWI9UGzga3X<&0_Ne>xcUSdK9?= zhI4Q%JrBP&Iv~SukmO^>Cgs6k0RH5@eVOe1Y%(9q#fvF?SsyE8W5pD_%+KfM3bE<< ze5`OPm&@jhz%hL?KAWDOh5lqLk(@d;8%xj6WMebgd@P>H#Il80E}xxAXOe||h5Yos zOy=}lKTw`bP8TU62NQ|-P&_dcjtoa4$zVJ_9ZUp=28M^GlZm0>@c!fg{6J}XYWiV5 z>x~e%wh8>I>fC2)`<#s4>93_f!049^bhy=kqnUhUp!YNS3kLcKqrYjOPcZtLfqtCP zZw6?i`tQq(er{itL>mc4f5AYfL56wWrZ0TgWxmGvud?6U@t!3{Cj*?K!OM((^k6Oh zPZ(Wzq?Ue-(bw=kTX*~!Z8wu&!241C@e@Xm8t8w+=zatJT}F2u(Q6^(14ch*823X) zPZ{X{%IIrjwR-L_`uovZdJ9dUX7Y}K-pT0(J@7ghj%IRM3v^Z4UPeD>poba#my=ai z+K4i`Z1D3tNVki`&3-S*r9KCAv+qJK0^RJp5E7Wr7Lz29I6DuYx{$!^{HZ`HUPux4 zAL+$B3FNbhcri`_#pIbH<&Mv#r-7Wz8E6A4)0Bru(w!;gaqel}B#PXSDUwARuf z$;79UnOHCwK2mLtgsaW5P_;QeQEiS5uQw;F&G8Z592lyK4Xo=Oh*ZS}#v`0}XmCGo z4on1jGqit@HzN}Rycr&@4iP$16$?$Si%pDk-r-<%4&n7Vgu`Q;A$+8&EIeMFS$MoU zaQH~I8HrStMb?!?#;c=6CPplMP-96Lzs8A8e);&=qrYj$VxEQf4>6sa;k0v6-e8db zK9+a$>lVjR7O#a|j9(u)JjxjH`iNt}YkGR{%U(NR|Dmc`F3+!z9C{f)@8|N2Oz?9n z)Xn?(^^`-IG4g&se-0(+Z_vr}>o^D`1etY57}_X{HsHbQ691ibP9scFv#{r){hd3Pf9p^zfGOS@iHnBE9WjOi#dH(z^ i(D>Y;Sv^t*v?jNockppJo`Qy%{B0(G%OGGt$^QaNrG_m4 literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_32x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..432c63e0d0886c12442a17222121c239a7dd85e0 GIT binary patch literal 18512 zcmeHPeQaCTb-$!2`H7+|%9d?fR?H`xmSsn#sBgMU9Lka%C$Swj$O?DJ!cZb5vZh3i zq#e8U=qJ`vgv^LNb?AnzK%B&JvbJeEz-_WM5ZPhC@`pXxS^r3u25^B5D6kDMwr&{q z;kR?{yZ7<(`YDdcpy_};cJ2QX*YPX zyOJLB89b+=9**_cK34y1B~2SF`2n9i>(37IPou~6`TU;xhg=lr_i6j%-F^}Du)oz_ z*cZhU2aK0Q?`$x{oYJ-+S_Ah-P^SVNJu?32vEL-5@V&@PG@gz4PsB2_v1Io5FTC!L 
zWTvw}BZn8h=Z{S0PDYa9$mC=!oBehs7Kw(R&m?j&sEI~1APZ;Hk;&NA>11jml6>7i z8<~l{GX91?H8quu<;d&)Y~rgitr(_(Wnh`j*i&JaKYrDwdfyR9;I*qFhO_ zP_Q0BOK$Mxh*=o?+85>N}u%zNBIaO( zLjTG{EN236tO^4n08iJbf4oNhXR6SDGn>mKqOtJ+_k!oDFd~8wtx-Q-qdvq;43Vqy zVh%zd2O(XB5fOx3jryl+)IVE=J_BLQ^n~kL6;?zPuAi^fz+$Zi;MrTPH;kFyaQ#vh zRw{t_gIW#zQLP5PRfPfO5u;^~xLk#m3Lvi3YT(UU4Scr>100C5NBp1)D-}RotJT0e zwHkQ03Ihy?{bi3RRbiz9h`*}UznM@10sRAHe4gkRQb;6G|L@ZVJ!xblPm!VMd# z!i0#z4V$B83rfuvTvb@OnoTDYxf6U-+_1T;Fj5IcTg?_cHCyPY!os%`*>LJ)EEj5YqoHxW($W`V?kcEPnoWJE1k=Pv$2^-I-bhJ%IkN`=zF#6cicj4#r6Ah z|8ypGG7a5N&%|bP;i<&wSTu~k0{smprhi0}l8}uw8$^4Q?~$X*4cl|?_-7)g z!&AvfF8q8dbAlPCbr9e&Ir-s0__vM@MIeQl8Q429)i*pHo0$nur3d@T{fs$+fOzd|cEn3mvH{gk4vwg>(ola-Mlc|%l zx%3N*Z}O zlqzOTs?Y(X_z)~89Z6ZU8pto(q-U7KubV1pc+t8Z!d*_gn%AH-qui3k=h$d0i<;eCG-p8Xec2e z&GwQT;MRbu2zw*60~%@pJIFg5O2>e{NbKs1c897v99=qKt=sJeC4|xi(x4WhY$xAb zt|NcATvs2+pDSG`-Xu%QH_1zIzX@FEIKBrLB4^#h_Vl zrnFISq_keIr*wzDgHmYs0={X1c9cRpN}(O4(2mmfqF3`ScrR}9?pxZoai6xYuLo>SIW~LZn=V7!t#Erh@r@;*=Z$j7W6(>quH9(o@y=n} zjU{zSJ+Cf#RqtGZbojg-9dR!ySoX8HQa=U5{^k4iQ!x6OhcR!3J{>Rz$0#8$6K7Y! 
z>GV08pl|3OG1>@o)yT`sxTcSI8;W&g-o)%b^+HjzAIod@r{I1X?l0vLpL)e(~L94b}k1Q&1FN3yxIS+FXe&^Pw+Gg^LtSdYHZT`#>jsK#TY+NRbwF{Q3X0Qrk=k~t^DdXKp~)Hl zI`F#Uaw|57oxm5IT|STQT~ZyeN4$Pb_xEZVY#_ZiK3 zQJi=6b5A;2lzctR!K>Ke-R{EsjwU;CG%0ZhlvOH&`6UQZool}`r?}#SYrpxsgKM2h zwBcx0$eO0LWKMBXxeaB&*8yEqAIc+D!Ow?O@6s0UJ9C3`FV78mNinEynOD(whrGu6 z!My4E0j28)l&&98-i~4cJjDVi#R4eB0w~1-D8&LO#R4eB0_c7H03MKr9|Wl%K-ePQ zU@%U70v}QEc|2eMzYh2wheOlq>kAD%3xJbh zr;XS(9m~wl2F^1(8#&MHY~noZLA2}PJbrWl{ASMMOM2kjIlso>w*bFKQ8cZwvEbbE zqNeo<^TDz>ALPaPAivUl`VBjDK3Ep#gS|NMSS)QdAu)p*)zMXd6Yi03Ay{xdRXd^K^??Sc_m~}m)3XpqYgsUl0ki* zf}^{P~=BBpYC4;tMJ6|JfC*X|ou4X0|@YdE* zeNV}t9X{;4b*V$|^isUN+nmK7t!G}`E7)e+252Mg8NRp3Q*?gaA%pf{d*5c(URX1f zcRj%5;@Y8IaqW;7*A97c?T{DNP8|ClZO=U4Ya!D4cIP%n@gM6dvF`2!tc5*@Snb{c zSV?=4bh33dr~}*e>|*UCto?hKT)>)aXMSK%7tX)C&jM@QTd;r7@3GD9A!vg#!T&kd zT888QD%j;v0uB{zAgJTwwJJ`evNVwYve_&kr%N> zUc?&tRbV|NVNLy7cvhNzEj=??3?VFrl>HjpS@|`}1;1u7L}4*xWxvLKDqv0h8f6w( zvlybX7*h6YY%gI={Tk&W*2s%kBQIi&yofdOtHAp2tm3s4L#SU{#}IeOh#_Ua#�c zG+v`zz?#JnPsoTNEBiI>Qvqx0*C?~Vn#GXrk`Y77evR!Vtf^n4T*MlA5o_c{tdSS7 zMt&7opOdhrel5ii8n3Nm2;t9RGhSQ75E`$oV~F|OUNK(dJ{7R0evL9Ktfgna8LzRu zgf;bRl#5s+FJg_nh&A#e*2u2{Yad|!@&6P)@hoC>=CC@OAl_DI5&6tv#&9XFAU5|q zi}2^5)tSTUjDj|Qoz5cf**^I!Vs+-QI-8ijEyWS}%z^E!&L%7RHttpC+Z=1FGl$g~ z#q@2oVddK>7kwLf(YKKoeH(evw~=3kZ_h|r%V!a*Gl$jL#Pn+^j>u;Y)32qtvZ7z( zUX`)7ICEH?QB1!^8xq!^$5~{ZgtdGYu{v{5zn0D>re8~E5&6tv`n5$2u{>kr^IpQ* z>dawvMlt;wZAe&u9%qraA)j;4?~WJ;V1D>JIEx__XAY~giRstYF~mG`n0_stO;+@4 z+@})OR%Z^YGm7cgXhXvK^EitWpIgwM7j1JkBCEz#7jYm>=F_pD$j9eE*2oS^PQViZ#eR^7k>A+y8B7i@8i) z>2i-l?gjJcm@ht$x#68%G(YNw^O(1*D_-d8@)c;?AZ?2|cE?`Gq2l`x$fXv7K_6kc z7($jmT(5$`R-hoooDb%H2uXf`Ih>F|J2iuDD;abtbSK*JK;EbU z#*ObW+7#n`56V@qQv5F+-c>+9TVYIV>mcW<4SwuU-SBuEyybWV#)EvP#BY=M?Go>k z_#G18CGk529>#*>T?^;Kw&uWJ?t>vT*$Wi&R{?X5cR|2=5!!IP@tq0e-_=*0K{wE@ zdcYgstH3)WH^m=O15kwZ7tQ9Hax)o-br8te{2ZYdl7g`m81%#e429F7Q7-cA1b zR(<13w0_$GqkfyDxBXE=zFpAUCFuDcGwOYk-i{{>`3^yEx1iT`#HjC*^maaJ$ae~Q z4>CPM{`iCS-LRAjP<`BpdctzDWzeQ9^epbvg(-&y3obLeA7x3HEP@Vg{_x5Pgv 
z@NDfb!ZV$({RJE5=_~i(*FNJtI8yLl9QnGs;8C5$IZd7Ssv{caVSnBBX6J~bI1F=_ z_oxl?jZoLi>xBKv>$-WJuy=V~C$AIsF|WgSy|h1JPxHFA;1vSS6^vkEf7k)zi0^V)>c=-#qyUPUQ z0>laK5068=2K7I&s7F06=g%ax`LLBuh&9~ikd;kVzmiSdBlnCG;uW{4TG?dvE7`<7 za*s`jdE91`l}%P}Ws~obgP_M_n)iU$rk2caDDN zfc^eTpL5V}^KO_sJyXrWna8;9Y%*vQ<2{Vxul#tOWdf(c5_p+UuYbF%r?3!Y^>8kr z*E8Gz<2gB;n1#Pm{(=gByR05QJg)W!{P0ih?oFmCLI+BayM}FaKDx8bMM}V31 zOg20jnGI)8rqiiR4q8l}h)gGDr=dO;j>aZVPKOh-Q>pM&Die+*li^f0oX(`C63JM$ zH=CL4O(svx>;cJFVv{+_sDWrSG7yPQ4GoVB561$L$YdZI80Z@rn2bdSMuzsr`rr?n zCnqL%@loGPF`Y;t;c@oC$~qU5xAj!Y4S3g(OOkvWYkx{D z`DID|uT1WFyt4gWCU1MJQr<`%2oh$Jyp78xJMbA9Tx-aCPgL63&E(f4`4E%)Bs@b* zeq56O2<3K#9DBCX{u3a#`p)zm$gRFJCH~n|E=K&(GqdomEb&jzp7h5f**IZ8QJ%{X ze_$N~{_)#&z?{0IE37~P%HcAF1$;d=3 z84d)74w}W_kXalZG>iMk%;M-sxj1eX_Xl~gZ)m?++-s`!m5ucen$p1$(?b8yC|4UE z4e(-c%pBQBaF|O+0_GTo#}9Dn@c3R{+}qdBizCBk7bC;wAcKLju`zSTBV*-`1Lfj~ zX)IVa792Bs3yu$0^QCSzI{0;Oc>K#pM~^&a1g%PkTP25X3%iJ>qo^w8rSM{u|Nj|= zkkFJiFC#@=eCBaA{v70#haG$lVqXf}c=5+u3zYv|*Hc`dKLI+cQGb z>Rg{cH#t=$eg1sq)C={Pw&(i%In8O3>2rc}5U-uUpp5I&0bv8_`*9TE6{0`OKk%`F z`urSWevjU6v@!o*7p@`=0KZp9U7Q16JpS)Ja~~+<`Xv^>*vCJ_ zhdI$?mswOK6WUY d*X8|2k4~On%9d@~mc=JUwq;wQDT<?R-8n3oFMIz&MQNSl&F{z zIg)nl&Xb>5OEJd7Bu7GT3(@O0k!y!Zazx%WKY_ndQwMuyEMQ@}#>qlzpM6T3zdVAq#} z`o$yFDDx8q{;nl9QUhZ24e06>G|z%@sxUhg^ER2NglJoImQ!NadGv!W5VBS0$(~ZG zwCOyjq968E8DRP^m(rxplHc;dv-Zp&zZ*a9&&T)JKV~90zPoLYH~T2~VSD#_U|SSV z>@Xe@wQSPGMXIx;7U*v_5~2bfJQ951*cS-Nem^`FiKWA?6VcRkG@kzY^RKzWsfqN@ z$;kZoUE#6J$#6Ur9vh3M)89))!;#Q4smV+fG?7ROWTA92JQlq=5l@VUuVPktq;yy-d>j-QN%PE1Zm-quY+qbJA5qp3OF<<(?3!krWYr#D8= zglGsJDRU+neRZqU{1H4gbfVavBz=2rKhp=v2KcDb06#Ayz?CDB$thko zuPv1kp#+3qRvO?}l?M2A83C>w6+!s4j0hzl{A;BFep_jP|0*NEm4^iouA4|1Aw(3e zn=BPaP%4gKDOsn6-Vf}mk8zUs$3bG45ycIbL%bM+vd-8li$GYPCx74fX%4TXi4jVdl|36 zF4T2Xu;K`ZD~>R7FA?Osc9+|AZznUUP&zslPR0_c=<@wLrnh~s_wNYMivdi@yUs4%a2yb(dD}7sdrsd;nSh< zcsLV!CXqV9f>RwV@VGhoaewGbM|;Asg_-jDN5{K-6Va)u(0H=jgG?wh9rF55`@Esi zbPP5q<5Ae(#MLo07LSIf5BRa>yvjz5|QRaB__g??q$Burj z3a;_wThyp4IVu5xRN@Q>mUQH9n9|b56}U@4UF#Ij}IW 
zX+Rsu4!qE#)#S|NeD1?_ru+a5w?JN9G*{Ks7R>8sfGZTtjY!WG%$sHk=Ll)u;#_dT zn0&L=+&1vDkqtMA(*$qb=0339Psn+qsFxJQX4&G1ZGH!Ao8fdgVw>_n&+FHDhfXh1 z-PV|MYm;v4aL%Irro6hKo>v!~s&h6=n%kVs%`uf^4coe~T3cDYjmvk_R#tCo4*K2< z?OI?gmO(;ZBG#Q*tF_Itra6XvK&-aHm^Jd!BHr5r&YD~mnJZ%VABAjAGasAL%#XqO zIGkUcL7ZyAhuoN1vb9?@r1gxh1zH2+A~#Mk-6^I!wS>ARiuF9?%Ug>Y+#Bp)R#DpQ z*uVYQKV>K7>(tnVCg(x}@!qtwz``-2ikW&_eHHkEVzXZ`!MjP|v(}C_ zN5Q$E!iz)A>Cy_WeVXQgOxowBbo*AgKQJc>L^5iZ4)# zFHnjvQ10ivnm}@Y9Ag5Kh z*33^q9*)is`{PS4cb%48_k6BkMvMvtcOBd_vvmg7S@#<2PD|DJ%q452jqAGveHYa~ z4fu@A_v(uZy26jZ7|Jq%~3h_PkCz(9XLB>{^`kUN~E=xel#kj^clSaQQeWpnQ-HMmxTV#SOI4NP=kEgi zFsE2<;Q0G`VEiZ(a)flz{*!=CEoj%jpV>+H4cyP<0)C{M`YQpQ+HidSK^VW3-ze`p z#N+~gq=)+313GO)yZ#4P!4Kvk>IaT6{9r!vxr%uBA2r6W?+c~)^*^cOH}Lz*__a#- z1u*|`J_z^?Vg8{^#81z02g`Mue?iPYw3F}~#{5IMh@UktsGZFvnt#F7@(yC>QW!^H*W>m*$@v^AGJL{5+U{C>QW!^ViPiFU>#i zYWc_KFV8=D{;raL9x4C4EdP8${$(Zn{Fs08{Kfn;p1%$@e`)^pWB#F?gx>(>AIb&% z*!)%5{H6Kl$NWP(3BNwfKa>mjvH9E1<}b~^{?+o2&tIN@^88&T|N5l->u32lAmpD% z!Y_dNhjT@oznFh0TRDFTuT_|*LCin2lkgkH{6o2jpH$m$eg!c9&`!c{2=fo+B7RaW zME~Gw`N!uk&p&zou9AO4QvL;5{tXNH_xDhLowu^Lt-(gM)~F^UhDJ45uC-`qRFe?{ zqZ({fYmI6$`robw^ED6c#9E9s#Ha>~wHIsgYBl)l(ld-)gN8{ zqneDk7}a38)}ozJO~&yX)nKDqYgCib|8_N4#LvgqLa4=vi`!rg6!F7ajCMvf_+<${ zxdyZOTT*NL3~DmsVpN0WT8nl@H5tcmRD+Fbtx-)z|J&7I5x)VU79%cxgBmR2hqV~( zjB4d&2Ft%A(axwQ89d?Tl(N%5PVLMf`?&{y{B9 zTmpBRo#FevxM%nb*y6rULFupu z;rAl$x8Xk4dEB$=?10_M%Zs} zLKZ*X)C2(=fT}=Sfr1wIGx7I6A@Psk^$Gm$q;$Kc)8@QRouq-v)qqZ0G@Z8Qb$TCZ zrTVsjPIqWJ-I>?vE^NmE`w2BLKHNubRP=pkl&el9_n!sWD}{bEL*Lg`!9JDd{lwB_ zhes{%P0Io3FY?!AnV@|8@96UFAcx0PDsLyhxmjKJ0@XL)uj`v7Kj#5m z?iBoX3x4XLu2&_$mV>&yMey4r__ZF=^{tZMeGlmJ`vkwef?wN%y1q^F+wrh2-y!&Q z2!1<{==zye=`7Xh)llc+y#~*EKhtyMm;ycF%pdBmbOgo!1yETQ*fjxPh?qzeP zgUy-FyPg9V_#8O=iKSi0DGm7D62C{{_ey++#CHlj%h47gN1Gu>x4Cq>-KS-u-8<^0(IV@?>wgteN-3>n2_`qg6MnzEvwu-@~S4;jT|`qg6M znzAA$)C?Z8$0#P#uND*6loc_d?(mqZQB0;^EherhD`GLRt&&kN- zH2mx0J{A7Wu{tsmR6TAN{HM0>izlWhqNz|aoQdJbd7*S7l!?K|d#U79Iy4rZ4y8{f 
zlZjLYEXGcRCnl#SKpzc7qN67#LX*?uiO_f=6$;1Wp+q{AOeMxAr-H}MRI~*DB@%8$AQFk~z=8m|#yL!9FqLJ?2p8e4-_y@^jqhq^yuUAss z{wDCRsL$>#)!CTb0ne`Dv6;zVl;o{Uu5^mlgy?K7xgy#7nf-f`{2-HmD#?#Bxno~x z{3n_GLrETCatr;CYuSh1WEk(|u4NfseTA~MmesG{-{CbvlK{))*z4Hix5#lJAQ zdbm`6i^<;_E|u5O1ga%3O7cb~SEM-b`4=3ua>bRrWauE?2b__UR{CZtu%XQ*{}YH|$3(ReBWQ#=xZ2~J#Ni7EUTnBZr!neZsk7qn=!>)L5lz=7+Rct|Wc z0!oxgWsq?HABfJ(7DYVH7vY*?hjM W&lS8~&d1=w$p3xje^YXhSn}V)yyi9l literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_48x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..191f0841aea84f87a1f224218765002d966b2c3a GIT binary patch literal 20544 zcmeHPeQaCTb-$!2iJ~Q&vSpi=W%0?8ZP}J6N}?nxa}-LJ9e>AhlPn3cGL%S(iYbvJ zX~)St`H8g@!wX_hDY{}UkT2Uw+oa8aw9VE`WQPGMhIp{E{*fRJ=3*Vt0UHqP3&B3j zopV1PFR!2Ch|Gd?7!CUO&bjBD`*H3$@9;iSXZ9T0YqeMcHo{&NWRzfQ`BV)_e=*xG{ltE{ANFmU?j=a@C)9b70Az`C-lYqj{SW)OxNWz_9xc-KG4JdmixlK=z8LS z@ulF$YZ+UhSo>;Ve+X?F;LzcbdyYOpNap*YschEf}ydoNGkQ+WF!<0K9QVEN1!DfP698ON`%HDS0-Zd(NOHNXF4<$d2!?w zPkekl6-krJp48-15&QQ%CquClk>K&k>G11JG&p)aPaWZI(V5evv)hi)Wuo+Xk8ntgP&4W5wj*paQTn_` zI4nl!HR0EiD*c48(=Q4;x*K*IyW!o7C0Hq1 zh_9Dx;G5+d_;v{f_>LGV?1&diuu`-Tm&!HpYPkkpE5U$Rh{BF|qXa8O3-M;T2Hq;y zz}qDl;0rNa*b%uBtQ0N8UzTg&XXP4rw*&)XAqqR=E`j>wDfaS0ZR7U5sZHSkHf2L7uA1D6g;i*VIK zN-!a>!c~i{Yzy|XE!37^;YunIn@k@UL2=dMEWt?eQZ$rp!Bw_}rV=cCcQO@>pGXHs zyZQ>tp_O2$csW|jw$N6#h4$rGDD70`((q&`RaiGK>v4I9KUXb&6SdoYtbY{_qk^MF zy$6;vUz1a)tCo?nE$l1X!lC6@P`&mUkLzAfq?5r^WGa-1#*>kP|BkZ0m+QZWOw^Y2 z-=BIWlJOG>=zd}Pd5_5^mNeY zKh@`hV@*to!Le8*G`;61oXaB zxVDmSELM;|TCAw_&Ya0Tlf6b37O#mJ&jLVK#n5FQ3N@PAV;w=>(<@#?hEVPgA0S}2KB+r z;IqAYRn|(*W`DlQk{g8S=E=*8){3g?ymj>qQ2D&I0r5=Ux^5fjq3U&d~4b>jF2 zaD4Wyl&;aD=bGFL^~87G2?=2}U_Ia(z}0{afNKEP0yY3rzqnv8G-(S?;5lKgs9O(+ zx~lM!M=4xwOt5!IoUjsUCuXUcYz_Ur9GmiFFSKhs# zL7LXx9zE~r)O8o^q)sozO^x6mh>1LTy=ZRhac;j=nA`dqPxRa-_ks)hUk_v70O$g2 
z1l$0)5wH=k32-CeCcq}ZX24B=bl>4zH)#v&fwvy!fVNzKXlnx?+G+$uTN?q62AYa8b@-0!_^=1-V6^(P?pCm{7FAoV98^(Uav zqiX{+T^m5UHh^?(0O{HQ(zOAkYXeBv2Jm+K0;bOQ#UR}mkp75!a4<^u2;8{x?k-*5 z57^bM>l$Ep55>D?tFj#sKfOB1?gY%xeRE|I`#Wb3*V6b7hatZ}&VV@9;99pvA@&}h zA*vBE8gnd!S9s7q8{{ zA}!AsX?ea#%kxEAo-fk!e34#az8lF)klSBe#QV(#_m5{!-}_7!Dyb~qJ9}O3r`^2v zXs8>awFe zW?S|hTe0u_UEn*6YaTPW_ILJXn)1k#Vuy6mwIczBHk8{nz~vO*4c^81lJ7_l)wc&2 z*5bPQhhSZm9E1Ej_i(=CJJL(_odJdoD7S0BiSIBs@VG>o!NVYfJSjef@AUJ0Z!`4a zn)~mAHAi{HcXvL(`HJs$JRUM@)h6FxI@0|JEUdbA-xRW{X>OG+)1^F5qC;W!gD6!In!pu z9m*-bqj86P$#*^}U!*@?9zN2xc4gmySEkLz3@k+#$~_?pUt$=NJ~^4&@c!(YQmt z;yW65$X9$v;|}?|a(YQmNneUWZ&bZ&9yy80=cgR< znxZaIYf98Lt3lC!U*=k~UHP1#)|zHDrCIH1#GaBj)tVCJ%xX_uKeJlXtfn-pJyHL5 zwWjc$T2rE~Sq*B$9_BN%*h9Wq>>=MQ_K+{f9@28`A-#;)`*TZiy`t8d{BvYcO=(tp z8gZxOP5!yls40!OQ}QUTAB{UDZ}QKnMKvXjJ0*{zzG>WHF5-F9@SRdqqOMsDYQ!Dd zP<;1gs5Or$zN2x6bsfLwnbnju?odX$2O4pQeCeL2)|5uvp`3IN#Py?bhkWUtXI4|1 z)t;zt8h2Q~DZb-5)2Jy?*Q^FL;tp*nzWXxNntu!DueY47VBX`t<2kdarZlTPjkq(< znMO@%#2v~hzN2x6e93oaHKkeYiR*6~cbJD1-|?Jj)Rd@eR)ZRGhc*=7eHm)agyK83 z*5o;p$Tg)|?P>WsKE$(c0n%yXu3wukF%R(m4fH13wpna24b>YCM{M%(ZcpFMT7)8{Qt$fNG|_GqTPy){E+2dOMR!?WED&t&j9Ej*ja3=XyuewIqe;s@(A zFjx-=T(TLk4ZeE-XSsN$OGxZ}IE&Q+3|n-Dn{y2BAoaApHNbF-&Twmv;Wo0FwzmZs zZr2&^$T8fB{kY(~xC-Wn&tDtt?0GfvHMc$cpLux33gc*ov9GFtXFa;_Bb(-g^x(U0 zyBo%fbc;f7R_Hqv8lJQk>2Fc!tqQ$Oq1zOCyF%|!=$#S`bHVxk71XH>b>6=igekmZ z%~0RI1ilr1%}9Pd2QtF1_-q)SBWf=>2AzOgE5Wb${1~2>JE@QHd7m0=gj~0;{bz#wmfeis0(>atD8HTj)Ah=lXKDN9J*<7RqIbtVjDLru*CFY( z?q}_-ir$ukjK4+F^GJGI53}~Iir%(+8GoCk*D2|>-N)M76us?VVf^iqo>$V_@c?Vz zq3G@WZN}dz>2+~ELjL&ub?vbA?0_C{PCX!e2}OySb{;bwx}C;=C&#dp$BdW9OxI_Q zfpa1T_I+e)m-dtnbcaHF6uMKPy$anW(R`1#NPDyy_NbbZUi_SMQh&yMq5o^zyi0Rr zXLW7Pt@Z2poar|;SGDxpvd3VJ)SP9(=OWN{P_#+;OSFxMHYt~hw!1`|l-ERCpJ$~yoqgi zAHdxCd+=F9R%|58rHcsvdJ?x*C>;Mw+17Vos&TqKxRA9+J9=&j(UPGY74YEVP+F@wXhj9v&q|+vWYq6mT^M97d8)=+2rj@*~FZ3i%qy^ z2%EiTHhKF}HZiB%ViWE=!lq_sleaHr6LZQfHsRhSY_2e~$=l6rikvbIdLpNc!FPO& z@Ao-R->B7W@cHrdjk;EbE8A@PBOvod=K$hN)9({vzh7j1h5gofY&!P)h1OZn?@BMM 
zIsL3M3!f3#>*MtdS8&?J5sxDAb(%9Aj3n?SJo@^#oB23i_<1|j!Swa0Fu-X6hbE`t zuaexU!QU#;4jmfNy1gFwr?qv);?on6WH1p*NAXMj!Bjk$j>4DzlZmNRa4a+(Or1z1 z;>k2fj2#b6OioWgdn6c+jGmYXPEL=+kD}ctfEvZ`j+@)!#D~3HS8(4n(@(51EXOj%^pCz7_F! zjle7B)GwVdyQT2_9s<*g>)RQQ{@yuCw~CWMjI z+kcqze@4IPU1$q){wE5bIvM8sNl!r*ub$%aui=L-=`}~Mt4Y8oG2!we=eywhi}LGR zod4SWGMA7moS!>f%>NV4Z#r1af1C6D2a5UcaDJ=8e^1CiSSad=gei^mKT^kN6+pH}$2od0!YJp-J7 zP2v9t`Bpm#j2GKK2z;~Om7D>-+3!k-XF8sa5Ks8YY4~jn@k~sg@I*tYDB=GjKb<6= zWIP;7hlnQ~IhCf;p{dC+U`JxfID~jO3;|9&WAQ2c5fQ=P*`!0GfKkw9R}4ukG#ZHo zz24rv#-+c{xEv~6?(Qyd2MU)3wc*`@+toK>Tn-y*UBi0>x4YlC?Cl;lxC35s*}vQ9 zcxcEF@b5JQ{Jwr6>-QSQ{9a>Z!xDG5F-ZUJA)z)fV01CiW2g=E_zD6>Z$sS%cc{xS zHZ;^LWQT@*rTj9d6dn9EPjKYnBSVMxv!qqL;g(6^w#CO`c#KS6;)0|2ds(7c+@A!E zx_HkMYT`a9a32@M`ylpZ=d`%JtpfVr^ZB^Y7xzJdtz2HT3w_R}Kg&YfqFvlK1!{`E z7{5R`N2JI7LSNjc1;)6(0GxyT+6DyjggzY*3Mju9Ll(XQ^#8(tcw0ezQAZfRKWk(% z#@}%hD#`%x_i3n$k1X&d@{jmCZwbbxk3{iOte=zf@wS6Jq5lTYUvKPX0!M4OAP1#; zf@vIi-eNH#^nal6g}y+%Z5V4O$v$=uBfNZIbwFzH7ULKD=c~XIV~0!Qb&=nv#Qa5r U=vUBDxG>Wv{Jxk~6cm#D50GU0_5c6? literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_bshuffle_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..0d4f08a2f2ff12d9aaa1ecb3035e80261bb200fc GIT binary patch literal 20496 zcmeG^e{7rAagP)wQ?g`Jwrtb3Vt!I&M|LEN`c0*dOv$pWD6*ociWX;SXiB6+)s)DR zv||UZ{E4v?FAEY!4Z7|Jmeg^atWBD%Yu9WoM0P094zLG1D~1GX01e2nVpxG-TZdpj z!ri?e)Z^19TPCyoG4wM&-@AKv_uhB+?%wfzq|P51=rx;69t&Yt0a+#{-bUcz?Mnl! 
z@lYYs+{6n1Zz83n2*@bwVcn}>iYpV)%K4Bi#^j_dpzma;fW*ddl!GY{vWwy5KsNm3 zUWN-O%i&xm5(n{9%y8@$U4(v8X+9 z!gwjrvYnALkfCG)z+Y`6L<87$tmo*^BZMTr)4_uuLg~xoM*PJuH>A*L8 zUU!BkCSrj&dCeJ{dMaT3w)3Qr-+I>kSsQw-09&fao{hs9tvgTZb)L%V^{XFbC%F+D8=AhwfEE1ZEpAP6R+kLEm0}i9SV_Bnn z?&Nt5PNA-wdh*uLm$!z2JJFzeZJoz;Z$;u!Z!9qFiv+{bz>5Ed*w}aKzef$k*7V*_fAZm3HZJE5$JF5VEWJLp(LC}W*B^-smU43_gB$TUCp!c!CRv#Q;MqAut6OCs?2p@47Z$Pb;7=+tRn9LTF?Oupxg2EGF4qgexg4Yn4O>D$iO zWF+bx51*QeM_zZv&Q6boLmwQ~t(T z;F;RyN|KczS4y_tBn}h2b(?>$^nOAv5UX~{YAvF49B~;)_4gv&MWI3FTuu} zhiBLWV_V>vEM0`WOiK18N=j-h8?_+L18ueko?Ry|FQJbQIf{}6WIkiV|4b*6y7|Pc zZXSc%5BJ0@+N&63$c@?MQkO+XSjOQd4mSg|!ZVT^r#a7Q&U1R1@tj^po=WP21z1OK zF6rPioNvNvttrR(cHn%i`zT&q9lTiSSlmKdZk9u!Zw0soUe>Pjb!`QRx+(you5AEO*LHxYYX?BoWe14sB4I75 zd4bNYRGTj?ttr}A5_}r)w6)Y`g~W?^qNJ(Do^~v1kc@PPQ%^e^bX|iL*x;h@u3a!E zh@~`nYY7w91=3QJ20y^^C%a*+UHCjO?@M;-OWkpowzv!FFbC}44X|ZyW3m(cwR=H( zp{3M1tGB#7XE|zxb*?AVR%^+p>GKa+Hd|*4!JZD7SRq+>&$7`>EE}ys3$!xlK zqLpm=@|?9a*s|&CUv1e`5<(r8GAr4zaTA%dmQcD0X@D01+)(J5MX160Zq*!%yB+V& zwamRd*Xkfix3+s;!!_ILVD^S*q4owy?G2FH8z8keKx%J*)ZPH8y#We6j!O>q{6M46 z5AgoEqGThi#a5jp8vrJ#O|LGYk1kpxr8Hh55r|ocL5K+r+PXOo>-h96v5hRd?<>|L zo1aUj&1kc9`o3cDp}Axh^YwijOZHg`o|?T>a;8S~e?jX10_~rL{&O`43p}&;7j(~d z7WB*>D(Ia(SkMP}Ki~s^4+1^}_%PrjfR6(HDBzC){y5-I%-SGUbh8KM)eUzK+`Vx3 z!QBt{0NjIc55YYQ_Xymha6bz7V{ktX_Y4=?SR)=EV^D;m?)}S(24Ew&3;1lKN+Ly+N8Iw&gh@F3%Bhd5(z7b3|O8BjWNL5x><{5trwPxI9P10(H(T+ee%0Yeas4hv? z>2>pTT@NL6?C;)txf1c+BZU<990&cDFLv z)4^cx{ok$hkg6Y4Ra=sSmTZ6Yb%8%{P4l&h_Uu0b_C%VrMoAZa9^zrpf_wuHa6ZK! 
zgU2{s@&`Fg<*gnDOVMt94};xs9K|2~k8rx=4|0Uc+dK?bp!~pb&Zqce@JpO7`GXv# z@^%k{m1w^{AG0eyXO=%8&tQLlfcpdTjmSkvADCitsy_y&v;EQc6=whbQw$EAWpMDT zEB^TBf^2^bVSHfxO8ywe_&}QM4;GUek4+jMBN!jZr}$$O;{)lkKUfUc@>r(vF@*7f ze2PDYF+Pwk`-8>2i^o2Vj}eRyAg=pvj1Ofm~!&2hUelp08+pcrZSYPw_`L#s|_RfAD-|^xu5 z`0%V1A0l6g_)zoJ8u8Jq#D|B+M>mg;9v&aPQhZb^{^-N_Q1cbWhjG5rc)p_XF@W)b ze2PB?F+Pwk`Ge=HTAr_HeDq;_AfMuoevA*KOa9>b%Ej{)jgJA059Cw)F^KVjbjcq) zU%7d{qVdtUR(yzjCE`QPS8K$_pb{T_JU;q)d<^jT7?k2;N!ibbFg`H0<$Q(lfwa~6 ziilbQ<7foq1NjtxjADEsUG@jdSDD%a^ZyXW2l6TY7{>TOy6g{@uQIg?%8y`tAfMuo zQH&3y%l=^bDpT8FJ{nppK199}@uB9cHR5AbiH{*3AHzI8MtFRTO7Za+_~Z5eN!+r= zGODGFYA*DNQH`b6QpjgibD>R*YAmB#%Bbc-d7~Ojt)-C9sOCbS+^WWs_YAq#LOG)v zOVn1_kG0l%Yph?JviEeg#xkm6o4RCA%7jA|^STFR*ALV2SaORc4l z}?yWOhBlKmmqS}12!W6A!&S_}QLo*L^##UE;o#Xsk0axG<4bD^D$YApV_C{s%z zpHa<)b~385_~)psTFR*ALV2Sai+}FQ)KbW2RCA$SZ&hQ-{*Y@elryTaWPf0-h5lGi zja9GsL#?rlYAK_d3+-%FW2v%~$KFu>y)e)EbND ztE^gT*r4V@I~&zl>NyVb8P!~9C!-omJ@-LAqnZopMm3gtPK10$H5c0TRyCIFk6{rX zP-~&w$Zg^SYb~0u)=^`bz#mv+;ko$l7tAM9aGu`oC`nG4=HYy`WLCU~!gm-i0xzBu zr>#x49yqVT^L#ufzkp}r%}qf(ySL$aLsL^Q(bQCvpuBF%i)a0o4tLOw?}6OzV8ZRL zA^hx&kfjfHYM`(jAW%s)z*>OK0PEl#CA??Bb38&q?^hB6dl`jQI)l|I2Ja^2lwRXu zaIenbz7&J^kZRgr>tS%e&ftL*gZGkV+F$2kuwG}-nPRX3=V6C)#3HZ*zW=GPviD|4 z*BsX5f2H9a56p89Jfp1u&d2qZ4=tJvO8AzWmP7D7h*v3iwSwQR;57=qSHbrw_&o|< ztKj<;{D6YrtKfACUa#Oz1#ggWum#!+&mAhtTz}OGCV09aLD$e_SVLm{B&79o5qQM< z!FO5k9!0xc;?~KO0?3_`XD0O9x5c4Xw)zuZPk%wH-xf3n4eH+y0dx zz3MQdR{%5zMv>e5AfxY<Ivt9_XD*D7-RzsTtOCAk(!?!Y6g|9~QQ?{6{sy^>t3Bv*Hw z_17tK^}o&N^^#nhBvt-04PHiXkmTCA93g-3{!SN!iWOiJEQWG`Er9O?*tRMc zxOgsT)U7mjn^Fum^IXuvb3v=VllHfz7;Im6EMF9{-1niyC9Q28@J0o1Qt)O4Z&C18 z1#eUEb_wUP^k?vySj5shW_$;houhyH8RzKsgyV(wr?myUR+604wRwluuH!qF|L?iG zs@;;D0b8m$+=TBMpl?$2N%>s#oe+IeZWn#WMW2-CMc=UKlXAZ3drBKmfTK50)8eOpDJw7-bH zQqd>vHKMOj^hx^;_TfH^d(S_?=P?m4ADRAd_MY3t$Zz(f^v&PwN#rx%lQ0imh4ueq z@|W=5`-e;SnLb@2xG%vzCF0|cpIb1yGt$ZX*V2hOc$H45L4?kYMml-Fkxr3=7eG$r;HTkxJDKmp z+0HGOY8H6Ea&Eb-hQUp>3~sKo=x2cU^Uj^b7N_68!*M^)x*o?ZYqID#?&n$GsK$QzD%Bhm20R45Q@ 
zh(*U6LZQ>s^+5S@U_4F{&E@y|ntlF>*0%Pxwt&m$8+Z9#&5iBN;{ktjduvCa5&i(l z_}KVIL_&H@R?7({G1+A=+w#Zyg~eIC|ys{_6AaMP$EKep3~n^_%Ct# znIqZ!f5_<{eKDJUmD61hX4C&v@E^^l|A^C5z1j4ia{8MJ{XI_q82;Xke0|92&-Kex zLVn5VXB7Isary~n@Ql!>oPKT~L!ymhx`4o*3cZ5U`xHI!b#J&f5bfb?JqI|wRH3(W z`ZdL#9!|H7Wb64E(#=-#Ng$j4Akd9|_i+*EM!)+Y&Y5sLK%D-wGw@Ru;+&j0dOS*;(Xiha_Yr42a3)T$HUY3;~#=Qi;4Tj00u#t zwHPEJ-&i2zb-7x5Gfj7UrrG7rG!L&dySg*f?#4{>uuC)>TQj_k?hJ2ZN2b}-*d=&d z+^wS7)O1)h+j@?OW>x_UFb?xyC9hQ^HEj>gO+-5r^S zcXwoF*3sA`#5z_UzTE#J0B?X#bH;hlRY@9|f%8@}l3$ zaRKlhX2X1^WA)onu7fuW= z5B-1TKNZLA2IWP4k@?-&E+&)tyKD?)%%Tmj6>gNpXTVG3CGmIEB8;4=PlWw!oZbl# zX+l0Y!8(FG|9_*HGjf1)m9JxfAWhi@iI9I=X6C#|5_(3VEo_H$aoLZumWxlU3{VYj nF@Ldso&cJdJ2W#_AK!Px^NS8KuE2xPFp__l%imN46qNigBU^uX literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_64x64_splitk_clean.co new file mode 100755 index 0000000000000000000000000000000000000000..888bd5897b51c4e16e671e3d10a24761abe1596d GIT binary patch literal 22576 zcmeHPeQ+C9l7EtATUPAEPMpMXoM@7Af)i|6mcIi8*;axHP6%w^fMZc)OSVj8ODicM z*~&U&5t}SWk*xP>zwU!<2!UH*m%C$kx!WyEOHsA=2f0ePtvV={s$3OUw}0GLQTJ6_ zcT{xU?_>1zJR-5gQE;`Bn*65wb@#mK>FJ)H8Tr(nk-b)n#cLyMDI?d2g>NHx`Sxpv z*v7r($m=6^_HD`50p)xgrjp@6QVSo6!^c;`r|W=VTC%?`HQWc6%S_;r^Dp;J&Co zal*JL_|aO%7ASmQ3GDY_OamO)@4M^Z1B9f07?=u%69LzeP<%QRNqpm(7hHk(MB>+E zWd4V)z*zEVAmR^ZWHJOJ!C)MC{zNP=7P>SMiH-&$FSw=yQ=#ws zUUWsr#}lC>dBK&K{A$Slvg=qNax~;WGC3W5m5KUCkB*Or;&V*p`B)$*l!?8KJ6I}h#6`oJGW>tme{nRF zG#qfS2m`VM4wb0?P>K4F7NP%AA{n0yhJ0=j1&50;B0C{iqJFqUeaM-ZB9|4#!U-M1 z39%xK$WBO>sDHdf{gXxLb0-WLk?^)&gcaEfZ=WvJz?o7FK7!tR`MM2 z$E6zhPN@d|vpOc7S{9Pw7E2L7&818)~$KsX{75m$<^lIMtDm1^MKQVsmN2m=@Q2PdaQ z*?jw25f<{C@L{P2ep9M}e=Wkm#RIYvJ}SaOo)i9~R0F>&)xgI^7`S+kQiKWF z3zsdnk}cRvwoqM!g-eN8WHNa~TojirwM7`ocSU{478**n&{TwlA513v(W6QKXh%=Z z9a<5F^4-x~vW1qCEwnDjLUB)3E)Grx5;?!Q*&dho@aM9n&qVD;ZT(B|Fv>rgw|DPy zj@RTV)MbmWWDCP3TNqi61@)}m``#u|2VX|44wvy=jnX38?X+afQ#qxoR9Ki#Ygd8c;0M*hswws{>T28pE~$rstmSq 
z6_28EEN>~|s7Vnz;HkXb<`c@S>hecr>P{d|l~t3|a8TkozdZg$+_sS)FIuve`D_-> zVTCnw4GuyF{0J}uNKa^`HFF+lFJJ~x2h;&cxi#Yj@B$u0+ES6x5Umv%H)2^urWVmw zkwG9e)=Vw9TL;w8?NuNL=v@VNkakpL4gx++tlHC7o0hfNwr9cDS*w*HAr$9Hg_eeG zEBV%98Ts~NS-E@WROXrV6|%5+g**%E)3DCNdTB9BzPX4Im?dX9{t?HgIKD`+E?Y-^ zP?N2pSe31!Se`AXxGlSlBFH_3Z(2Z(BFIq$If@`hu`%t`o%7DK>z%t7cCXv5?@sN0 zwp*`ATgmD4uU1(yyWw!NfF}v(0eqI(cpp&+(nkigX#7Gko~p*;HD$hG%r^v1=&PM9;xI#L+8@I-E0{ zZ^~|OuEY8E<9zMgDP6TSe5T2{u$pvTt%F3d25>cCJ>VL^wSe`2>j2jRt_NHP*Z{a5 zX*_2Q5Hn5MLLKnxz!zw1H6Yqr1BkZj0nyf4K(w_E5N)joL|YAj7#AtKqxmeITeUV< zUEN%6F9M+>muHXpqcwr%TVe+I3xn2yAy#+|&qjf@>*DUd=o9 zMm+Z0a>w3S)fGOo$+@r*_TLD{(ge5>a1&q?;AX&0fKI^8fEu6^a0{RYNaGcceUrA( z2)sr(2DG&i5N$O9qODDUXlpYd+HwM-Ee#NDZ2|0>U6H;U;$!2ycDAe9KBIR%H*34g z4%d~Q&f4vc>)BKH+E&?T%HbHC;D~ZE|Bh{imDpC;!#3E~C=ZTr5^OtGK0a%&4tK5m z*4MgLIwEMpR%0iXD^`+OyMyvA$OF0zu(I4cgQ&sv)UAO(oNvr_%|18V?Ih_wZR4DV z*GRXMonJU^dVT@v`30os7m%J`Kze=w>G=hu=NC}ep}qo|`U;Tx3Xu8=kopRc`U;Tx z3Xu8=@aE&hN8<$2C~^9HVG%P~XNRr_AYM8=EN(oV6z`a=NO!>X*R7LuJ79{&(WOP) z-x>SyYMLXCABQ{zc?a@<2HU#zBwV{QGo<#=wcB@B>9JMMq_b8$Z`thbDu`RFV+L!~ z?iG&hwzAaBIfvdX#_yEI@1*1NFn+q(TQ)p1P_}Pouxw;zsO-Rux9lL$_XB+h=m&v* z2p@uWhxHJw55oEotdGF@D6Egc zdKlJGSc9;R&*-j`I;p+yS{<>#^#$7&NFqS57Bu)2={17J8z!LZ1#M&WT0!H-4=}z? 
z(D<;{4TWvO7FE3A3w9e~#2b)JDvFcge=JVJnXg-f7LGyWR5;UL3W?6C0&#y$tI~1*IFj44d@8 z_4l`edWNB|A!SRiDLb8xdD(uO$v^#r*Rs$gU&qKBEMI6+uV-DOwjupfY(_a>6_=CF zBl7pwaK3at^{rH|CBv|_1@~Qdf$c{+&&VHK%lXpz)|;umJHxOG<$N2soD>Jh-@A$P zrFhV{Q~kaS!`oYN|6Bg5DZ9l<q-&Q9P+t-+LmSGC+|A{bSU9ks^QBlI zLsZ}HWmt{pZ1}zlY(GjY?0bOorC1<)sD7=NVLj@PJj~^kSUB)y&X;0=+(q>pybPQ0 z{0s-+{HU>jxsB%yl-U;o8RSX%L&So9fnRG3eJDRN$>o(;I55TeN-PY2mIj>_c)lv84X<^|--v0$a= zqJ`(QTwXvqB^GF2K)xIcEcdy24$S2Rlv84X<^|--vA}X=AJ3VE^8)5|DHdp6K%QAF zuzWkr^KLFLpu7?bG%p}ui3OS$kgvo7%?rp^Vu9ub(2=SfF_U z`BE(K8qvUOMB%)E^+1XRnioE~MtFIR$mIo;S7L$Y1>`HSK=T6fl~|y80r^TS(7b?r zB^GF2K>jjfp;3tinirHBLGyxnjnH_F$mIo;Q(}SU1>{Syz-vScuMxStfO1MK(7b?r zDHeE*aPu0G%L^!{!~)F=$d_V)*N8q|BMRpQrAE-a@X0k|IKyg0E-#?G5(_jhAYX|E znir6-!~)F=$X8;4<^|*{u|V?z@|O_{-&XD&>HPxcQ>jMK`vv4Jtr0}r#~SwwD5t~% z%?rqvV}aEOH4 z>?O>42D9D*dqy@bt1pI4am z45mE>&gl!(OPsTs+%sTK<+;JEXE5t6jNE{_Qf^Rt29z`FE$|$f^%7=1gIRBZ`ewa^ z+B2Y>S#MD=H(;(6xq-)mS=K)pnb5({cC!Jlhra?fDaTNrr(^RXNYO3z^A z1(cKO2A(6D7mzR23I5z9uV*mpEl{841=N@71b@zA^b9Cx)>{;jnir5S#e!MS zVAfmUIi`65^`%%)dj^y<>n#fA1-u_uVu9C)yq>|d$G|yZUO*d4EPNh&iAp6F)Lw$u zh`gS`thX@of_aTFdIlpepqvs5G%p}uiUqTt!K}A1@&f8hv7q)0MqWTU>3*_cUMO56 z@_Gi-9z)~>w4ubp=dqVK4e!q6=g?{|!D|GOdj_-K!pIBeHNxl_jJ#k{BWPYQuMv4Y zgIRB3^N!52=C zC!F$lnp0HPM`iIF0oxvU$BW;Oz`Nd*&(}-{dly2;;(Hr_B6WZ*fLj4u0XqTVgCBfH zgYRkxiM+d&kY+E#Z92p48HTrzIy&CsWw=9UxHH4>Rg-Lc0{YU7_6y-J#GPh3=GS@CEv79{Sb# z8uy#K!39s2rDzOYfEW_-lak`+49JN1!Ea#TdkF1s2kL(3KqI+@t7qPB4C_q<0(F zBjkJUZs>wb#SYj5!B7X-2lNKOT}yI77taOVx}9FTJsF0*JQwuwT+pv?pyRtT3~#&X zYx#`0mWMyEbxE``ZkH?*V11@eOky_m*B;dK!GG=5PzXzlE_0F(&16F?LvtNx5B& zJuJqgJTJ!X7h_V+7i0H`F)9CxF`pQdYJnKLLySpvL5%f?F{ws~G5o%Y9=lX8#8`_M zlWK<;+akuKIwHmz#h6r6#Ml}!Ce;@)246 zvWYo(iA|{6!sb3Rn|!>GP0YbdY(ni9HoMJi^6^48F$XWP3HJ}ere}Q=j zNo|sTw~zb%Ebr@azcoEJ9rycL-rwVXSN5^}ex~>Nu-{d?;9S#Z0JG3jv*n}p3|DZv zfg`?7#pNVtI2cLb<9WLL=k;t9H~RQE^nG-BR2bm2fRV{*_$y?)H2B+O+Q^7c^SE8` zPitw9M5iZ0aepk34C9aZ{E4VP8HOME#bZ+m|5#w!pEw$eMdL}37&{V}n4F$~@sK|l 
z8a+DUpPU|#`p2Vje;^X^M-%>7JUTuZ2_@PS@v-(ua`b*CLP~pGB`7MtLNroSA{#*Cw^MAwnuPOY0=lt(Hm@j{w z^It!d&#$5Z1o5Eo>p5Rn?7+{&V5uYzu@h$)+sXMph2PEjHpQP_&L39zKS93LPTc(O zD2g521AMdJk)8s++3!e+YdV??5m)fUH2fBpxF)8Ly261(nDGCRpNtb%JQ@rn1H_dK z9Zyo}z|`azutSk}6fW^#5H4`y8jDWhkA?~Ut~MDM1q_2W+hRx}fzeRJ?{;_ZH8%Tt zjm?2RV{<6CIWTB&2Rn?-A-CA<=rxRW&Y?x4rm+rWTNSQyA1@u0_;{<~Sy5}{TUV%XxV1$_NO7qP)n{C!L@Eb5b>Q5S2TP!n}f z;4l}&I*9wSb6S+QRY3n&HX9ZCq7Djd=JH}(=yNvx88+G$)zla|!J7VszX)HRgFXH&cfY`5~!?0nd{}I1MOr(rhmggT$yr&TZ(yWAQqE|!gG(S}BcsXAY_v2o zTA*x)*J?e&bBk|+ju*W+x5{SjV9}Rf@p2wnN@c>Bt{wVM~u}SYp;PPlV;`4@Ic1(E31K;a> z#SsYxV}Uq%*%2H2M!@=-GcaEtAWw0zEd9R zLU@Xwnhd-&*05W>#6PE(7?}v4y5SXeh{nnd8oTKgRvH-nuCK67T%l&r*iEmn#=z)z zeTD7f3Y`Xx-Si4w21ZL>A)h5Ld&83z2vo9k@o$jpZKF6Q7R zYapJ@*THx4b@1IB9Pk;@wlE`J$iYk2KwQe#!K?W?_(2X1#6T>}h`-3eOV&WVk*|X{ z^L6lc4i5N0v@gtvR1RLU2I6P=I(Ro<2k+(JKn%pfjJTGAm#l&KdA<%l$k)L~IXJj< z*grNdw$1nEbMTNg2*1qN!6*4T_~#rPTsk5T!e=>n$Qp!S=j-4%`8xP*4h}BeCk?_? z6Uo7aJPKD$mb^V!^Y&1dgNMtp$?#bGm`IANrt%z|WDi9}-X1FR_E43BhwqQYJdxvZ zkFUOYVK}rL9AyuOJ#P;+d3&h66%V=HRk_qY=8Y|ko0GM8Yd3$cnp_5EH~U)uD%_0n z__A7e-%5WC?m}HPb>{7%D{l{nZpDKdwJ&(P?)Axd)DsJgdnZGYXka0J-K_0fjbFck z*^=@58^>rga(oh+9~}=C{WlL5a6zXJU|-kAPtdQ%eaMrIVe;jz&P%a0fF(d4RW z_)W*S_oOEn_QpMrMWV;Jb1H)c9&b*5xX&|iq{$1nFr$qwU$DM;G%!By2~IZbM#2-H z@VJ~On;Sj8SO{)Vh68Ya6IVyiNI2k~IQW)hGCFp`8wd8HD<*F^qJgi&{n4z3zYSZ9 z$@H2dHaZ#gj6{x4#3x^I#7>R-BH<4Yh~}b{6S1I(4Tq&Dg(O*SPT@q*0W}=963_#5uLM6xmlmbE0WT1lc+Y-UIZ#dD-cOTxLCJU3TB{&23K&^dJ`^M5YoX>$eQ znY4M`RQe1dTQ=BcZP2IY>C&VP?8?ZR>%?Y)r*1Rvm+dFyEU{`At=2NjhRV?Tv~3n+ zVA}#Vm6f4&DZsPrJXOi?B9(0n*)~>%Y*kxAwk_I%U8QXrZP%r=S?#PgYtwAg2~ush zRab}XBw^V0`5U#JU~Rr|t8FJ(+cVI&EpQzc=&PlTkmpJ1_C#r^-LkAYgx7>)Sq)>N zljr9!_6KZ5$pSKy8QcF%Cz853G^Lv#g(D2d#1xL>GO!`nrsm5UEjr?Ijw?8>*cIbWhBqZWWEyIhb!hnA0H+c>M{h)xH|9e-B>2bvva??4k2j zw%IkL@%n055GnxI0ImhB09*&S7H~b_I>1W6^?(}yD*-nGZUEc_xDhZD_ez*=Rod)o z;ID@9LjP+3(SHRX`dRuy`Y#D0SH@VGe4gkl!|ko_5sfdJW9xIw!@=8(}^`UZ%)3Q>dnXwq!6n{01RH%>o4-DD$4 zm$qp}!};E1W8)8fq2muo#~+Z6KlTy&f^nte4@k!!kd8kf9e+SN{(!=-?V^p12k_{4 
z0PdeIN;biKZqiAz0Wd+w`0^aa?7a1K8O^=Zr(rFCoQAy8;9NKBkPC?^QrMqeI zS3a3cn{f=&X?F?4)Lc46_kv}m+bsp>rY@GAwF`NhByXeg3m`vlZ!d69briHubry6^ z9W1zOs=c5K=zD=a1oRQ0yMgWjx)!uO=p# zn{aM|Bn5hnpkW>(T_I>JP=H=5Xqcx+uM;%BgaGtO5~aH?=z5*h2RxVifVA8Pq~$&! zE%yOwxerLoeL!071JZIIkiJnL`gtD)l|DGZ4#t7T0ng<=AT9R+X}J$b%Y8sv?gP?t zACQ*&fVA8Pq?hQ!Ch{Vz+1&FPo_`a|6~%XlVnq+?Fk#-%PW?*(Y>6I&;PLs zLh5E{A!bTj-3-f!h0^72h83iU(v@z8Rm4hZ&CSqWnXn|Uo6jas6V|3ie(?|8rbLxI zkCE3?n5a_cvtC1e)ub5KqF!$a*OSH<`F+bdUm9n9C$)E`7&h9m?Nzg^9clcL-?NtU zr8wxd)ZUe1xDWMuH*h^EKFIIe%=uEBbO*I}rx)io*TWxf{*{VyEwR-Ig&9&hKY*TO@yI!n9dW%1i$;Vo3xBd#XOF?YWZkN7m z2KhZrSE@vABHg>^N)=+?{wq~Zv}=2ZV$VT_z4tKeJN! z5a4_%zT_clU+!jDf%b!ATu+Jb&^YHy@g)ya`${*%Djfg*uS0yWeI>pF$2nh$FF8u> zH8(>$+7F%rJ)`)-T8i?aZ}RxUnkv>?j935k6~p)r%w)%R@LMdtL%+}B+y4xU=fEE@ z9Q-4OLw~vu-z%2v_zq+K;+iAH*N6FwJUPB>4XNR43C&+W<}d0g@eN}BB43U#TXURz z?V5ozsQ&4%ho6tU#n>T`Z0e|Pl<03^B4JYeA$}j=4%_x-(k#O)KlW? z!~8|Q9ACBucJZ~4=C6OL{Dpl6;~NzD3;T^6-{B?l*QeyKpXYCo=kGAjUmwq3KhNKw zl)rnG_;zFdN_7V2We?`BQg86}neg?Q=5H_NFX}1r?Zf;gA-m&o59C4YN){`T?w?dJL0!}GV7=Wm~szwZ=e=Wjpeuev^C z{u-~(HGF-h`8$aDi+W0YhcJJUFU6Oy&rZHR)BNqn{6#$_z5|%Q$d}^F*Jl@BpK1OM zV*a9@65k=rU*t>i3k>M8N{Vg4dtj;~TnqP-vU7xk3*1~Gq;FUMD@J#qaU#{5M+ zCB8n)U*ya2W$SaMRz-XNQu!;^XOX|^`n*K``jq_j^ZX6+{2k``>*M+B=lL6y^7nnH zpX>fRal;zgs8%+rsWHYzHMCkQqn=SsjWIH+p^a)~qnaA+jcRDMRz^Leni^wdR6`rp z%0@Lc+TW;#7JDx0W!2Kyu0amVwKLYzxz^A}mG4Mu4Q*5_8`abpH=`O_t(8&FsHVno zH>#nHYGtFE8tsj0Xth>GJ)@c$$K9xgHma44YHGB;M$b`5e^jxW~I7~foK z=wFz!_h7Y#=HI)sYGtFE8slbEL-X(HnOYh3jB08eccU7bf3MG~m5pj@v^T1u`F(&) zt&Dm`H8sY^sD|eE4YF!wqnaA+Z&X9e@y)8GvE7^GFV@l+-&|_wF(tlg4Q*5_8`abp zH=`O_t(8&FsHVnoH>#nHYGtFE8tsj0Xth>GJ)@c$<6~4q8`a82H8tAbsD_r~+b8y7 ztfjHto8&Ln(sX^!p@#ky+?ST`VW~AVU!SvTKWD4IPOL@v{9{W zR8yn9Q4Ouu%BW{lQ)7ILYG|Wc*{G&Q`y18Ja(sv6S{mEEN&aFjP1olfYUoo+eAOD7 zug_Vvvd^HV#<&^P(CU3u)HAB7aomk+X!X7;>KWD4$TzB?)%&!lXH-*Te2i*n^}a6Z z8P(Ltzflb>$2TbQ7i(#3_a^y^wKVGGQbU^{zF0%!v$t-`OcI~j;dAB|TWRuj(+oVP 
zDV-9}&hh!`YoL$MpVC%Wc_--Mb3=R{cNU+$?Qw2Yac#oct@CPYJ|!`sLzD!02CZqpfVPcgimR8x74o8bve{^Qw$sMIxFEBLJ^D?KEtlCvUdWIui320|4PGiYUsxn7>iW}@O(~h{Me$E z!`H*c>y`u1U!=DxbhSd?rqFhU-lovo75a9Cu2JY63cXXI?@;Jkh2Evm4u!5$Xs1Hg zEA(!KZjfjgOC0ZZxMx?f-1$x`4B-Jwg3i?!VXlgKnvmw{dC(E_6rbP2b7Aep($;do z?S(K;@fkn715r-rEI$8L=IlPwbxmtC!&mC`b!~l-*+R&5>)L-M$luz*_*;PwU+pQs zk$if+u;eK!uRh4+)r#G1cQgKNl3kNzXTOKZ?TX#D`xt+lWY;X&Z9mN9+ZDUp?`Qnm zCA${MuI2$IuTkuFJjnPvBs-U6xAP$;->KN$@i61xA=&Mb>}roPd97l%>uZd^OS0Q5 z**P9za))A9=VAOh$?i_c&go@xr(#z>!ua))-9E`~cYw)vD|QW|jNc&H?dNuc{P71> zjgZ<_z-E|ys{mal1n0ZT{ccyKk1eU+wAX>tF5Z*|psN zr#&6l-)oSEKP7()??1gYx7YOf9Kn4Z_IZ(~?}A(h`A-exXeapMy9)Yz%g86x3c}}K z8TsV$rF>pC@(J~Z@cCy(KDm4;pU)WigqlV8{FaeVE?>$g?pY|y8c(QugwMEgc?rx#P?Sh%_S~h$|vqw zi+n;ID12&0KDm4;pSWi&@(H!1@QLq@FS<@HH}Wa=ES#fa&w2^oYt6iaUVeJMOtZkV z=F{`b?F?7cFkD&7u-s|Up97mOd(R=|ar(|Xw)QFtbNc<>2CqNZ)ZEhC9B_KQBTm1w zp}wVIB;apoY1$L0hd)R*;v3l^u6i-zYas+)t~h;1wyccvYw+Hr^jS~Oi^-b`znb&A zc1g-`%E)5!GezIU^*>Vh9h|Qze%PPAD<*H(W&1zM^}hr6PUVlE^FLAeG|153t_E3~ zkZ*APU*e55=`%ymi^&U${zcBOxmQw#(?8+-$**PeFLVB{yR-R!$@%1PHvetT|L%R+ z{J-P;^9uic&j0Nr+4>)IzEf%U6V9(v`2WWFZ}(>V`JD4ZJ=y#cI)REwufnh3{7Z@- z`1u!niixE^+s{tU|LoyxeiP^SE92=F{uTba$TwSw`1^>$+kL<{`W@(b;2Zr8lsG0L z@c?o7Pffs2TZvEuVXrR`_BfqQcV$jpZJE=ygPGIL-I>$&`i0Y$%xQ;9 zoYpsIoYl8AXSnSdzq^|<&YBwb2!-9v84oUJMz^D>Ln!QN?-Zx54rk`HIdcuJ=8WHt zyE50X$F<$x?#lGIy(QzUqbbw3j^;gi{eU(vWBm24 zr}OBMw!`R*v(*b|0zTT}>y!8q z>yr4ps*{XO$B4=~UCt4=BOFsBd{Kfnk0-m^bIL&CVzBiGHu%FbaznC zxgSqg*UurA(oD7!F^}`kIrp4<-+8}x&O1-fI@P&vkJV!F*a-XNkn6<4_mOz`exi@< zd_5OsZeoZ3my<%02V%7KF!el|;LZfK^KmE^b8=A{@su)FP*TKrw1XuOvYzo|TRPpj zlktMecDPp0A*T0Cq|-6Rl0-h=;qzI6{VaCEUL4=Se~tNRTt4G`BIXCc4(D6!2j`;k z#0BF=p&zedVnIdhmw@;^&}l$h_ji1)`yN83f9M^t*Ar*9xO?hAQ+0|S9*^!t&3&+mOOG8_wl#_x}S%o`o^4FoO^g~I*5(DTkw-$>wl z9WOY;gM-mPj6Cm*4nG{QzvP_og~kKkJU3idt1j^`=@P@Ep%b6F!geuPk;!B?U15od(dWIwRSkLB?4}miAp)+V9K2{!)J+ zW(1%+0|#;dda|_d&CH; zjAh_N4nizT`-if$Kbe6&4???vgg5mJyvR{_^K`Zj&SvWXYHy}!XgARC=7kKrqy^%; 
z**bVCTL*uZfdh_+))XS1&A>}qATDL=;Kghm{2&7dA`mG=ypn;Jv_SkMTL-UZ>);m| zIN*V3OCchWftR#E{3=@qzs}adn;AF|fk+|ZY6f1?0`YdX4&KSu!TT9FxU}CtJR-{G zo7Xe&kQRi0$=1O~**f^Q3>;j#OAf-v8F)ww!YA1}`0s2T{4N6rm+q8;aK%D0a3M$G zip7?-2Yc2Y3N!FOK7?5WD7wqaj16*m`~ad8iSu2|eAW;bi=UxbHI-u|@NJ&QSB zlc!KuEFD>U=*rr|zQuS@&)R2vT=()=EaHs@Mtozza3qjAf8A{E#h$-j6SD=+-`_Zg zBH{5dn0{y^FdFj?4nGv|d$9%jM|?2-Gx|^xo<oBzSx{`EH%}$XyLSc!U|LYwjPcX&efSjCcpf>S~ek#zwtv*F%l< z@K_U*qIVz^@Qrr9?i`B@PxxX1V;!l5lAkyufd}BRXxe<=fr?_WyyT1yjYYfz;qlSf z*bC0+iIM(r=-tK)R^UADhv!^|vny8eBj z!D;`aYE=$A4HnPSxwZl5N_+w?p2u@Jju#s~ioamd;D09Dv*F2}zy0k$Y{>y0HykG( zN5gLFlfhAw3~V4$sX6EE;p(K%N@-u&-3|PyoI-LM4oW=dSBCuFu{QG4IZM)VCYglq zM)ItQ>u>}*(2s#8fYOt<#F}^(cn{D7P#vfal;m0y9v~i|-N;+=5*ku#Uc!YmCofTi z)Rvb(LY7$*MG$TssD@$BhjBnXE5Hx(1$l{XppOx&_L$YCC2h7_lMw5q)ygO#l%64Z zS{(MRKx?(p!?ZPzvK7#cx_*9HlUhQW!@mjH7gY+@U+pIL@zgw9K}w zZP8n%TOM!F^Ws)=I{x+wOQHo1H%XqGv*zS2Nm^G;0+&o$i;vTaug>7kh5Z8pUEQgrrul`AbsSqp&(#^g+Yjb+I1jk~X}i5*6|TP-*Kfa-^7$3Pvt^Fi)ujH~Do7~B zKvx4@1GE_ETA*uyt^>LjXbI4DKudv^09_BX6zB$^>wy~Qy#(-9rp>Mb`6`GP`dOnB4^PZ-nD>0Nn&s1L^>}8K?%d9O!1CTY#1WtpK_OD8)U-Vxu;@5o813m}nl3dRuQVp)h2F)?M!SU);{nRs44Gt1_Ya3=XyuKP7 z?0my<)AJ3Ko^PP^d}BP|SP)0_d;_KD8z?>BKB;#@pw z#q*d9vM}a>Ed_VAh;0J*Z0)Id7A>c=W9|JxDd<^)B$zsSgy0shDc@VZ9 z*bc$g3)>Oc`d~W>+cDVsVe`W_2wM=g`(X>gHVWGqY!TREuuV+r&XYPRI&ggzvB32R z`xZz{z^@iO#2ooz!Q;&u@M{EbWBgjd zc~3K!OSnb(p(xrf>pc^EB5`@Ug5PQljE5iXaoMee2cB_2l0@Vxgv z@CKV-!Pb#+E|;)H4pV#0!)OKC_nm-oQrs!FFhBF$hGUL?6ULxS%5ef)`fL2!XS4?~ z(L41P8?RvN$PAY&*y{Thmn+yh`iESuV5|3u51D@jTSxwg%N1<({RzmWV-(mr`seep z^-*p*wrJkMIv`<-<}H-T*kZY~lIPS^-okMTwrJi$xr{BAdtE#Sr}7q#Q?Nzz7RqI8 zv0UxuIXjiNaGZiInzv9cV~gc>56|(byoKWwY|*@hav58!7Ig8NuyEeOS|(wO<}H+& zVT;w1US3yHc?-uY*rIt0ng7ye4YACN7+}qutoD0%4KXReTi|uh2s=# z(Y%Fn8CyypW881yI0aiYZ=qbqmeTha_ggql!4}P1D3`ItYNF96EqK3$bw#R)^nMFv z3)e)W*D~(6aJ+&onzvA{V2kE0lq=Yxc?;zVwrJi$xq>a4w@|KNYcYB29Q29fznlKl zUeT;4H0vD^UuM0c+7se9v)&Q$Vb&{}^@L`+pLGdr{hh@mZ((1=@43u+LYlX5 zjC2oXl`TWU{;&w(D%Gi?gR<(jHnzyj7acuFL 
znAQ`T^^Qi~GOvk7PiW*V9H(H5<}H*<*fQ%0&3Z>8Z=t<}Ewv{!@)nMh?(gyZ(!7Oo z30r18p;_;U_NI9Y>xzOcUK5R;5N*wRNF#5d4+UFare0CTmYlbqg744e?|sx>k=H~b z_k?D>qmj3;rbu~9{jSQ$TR2X^7R_5Im$7Bi6PopoM&3eu8CyzEXyh#%r(lcbEtJdH zGU*A;dPl^EY2I46CK^2<+M4x{M&2^1iC?B(QO1^>w=94y>=p4Hnr_=nJPGe<;rnS# zj)M5hmKk{0pkEujVwqL(}Z0?}ZdWUU#_N!D+X< zVw#TKOUL57NVYrS-4T4>4c;M{-n+Mg5cYj0A#?ARRf5BnK&yb31I3pU>wwk+?EvbA z_d4PGSNtB9kkC7~5CZpsly21--IieVHnNh+t2~Tu*BNyt7_BDdRPXXITB9>sn_#q# z)KPuChtUR|(Z&R$O{9bB-5y4pbw+n27`+|WSpwe?=Rv&i-M3;pdzTR9n!_IdZ4%yZ zgX7o)u~?Y{@3`poAJ{63;IgW}X4?(Ni~LrF-=^@lDSVZ}Z&!Gy!dEN2OW|u2zE|8#kcPVx?1592c*?A>D`K5bCk)OCA$vEZpS#&?@;V+e~`&< zm+U&Z9U)J@Q??T>Z97l|=t`g-;7ftF0bdRjlB$y9cJdszOSjXU(~@9xH_ve%p5t2e zQmSuDFxt*@TnEo_ou4@;ofSE$>jT?P=^E34-=**^3cp+7Jqq8d@NEj;uJ9cS-zo9@ zn(u_?o#L8r%fa{a(realKIfX%H0?Ow^oVw*L@S6->Dr7#Ytr$3p|5Hg!_DRwpPTIi(uEOdoJC)H}9%N07QZlezGgRpkr4bT3>b=+B(zf z)%5gxx&Kc!9rN#1xDJ0wUW0dBU!U7yxiLquu0x#{*Xgf8uY>-VCVI3Ja?z`x&)3a- z!o7m<`PXJXxqcy^m(6^_{f6-Q7iK=Wej%SvnE8Zz7UA<-WYP?r`@xbG1@ zV`e_Nej%S&v*!7Pdne)3Yvz;d7xIZUYo1TI{}Mj;n)&4Vg?wVon&%Vl;e=288<2Te z;`)VrV$GW86Yc|rPtD9H*DvG~Yt}rUa4#u*;@_LiTPN3>`4lw^F)C`-bMUtz#^0_L zoxEOHX@h5$C$BH7VsyES(G~TK7P)QulVI~j*Bnw5qkk)h^L~;2B+k36#irxDUu0j2 z^Dg(WdB4d166amf#^(J(`%IX(r~@!YpRG;7b0$I`w<7&`9sWFKN zE+}9ufj@wwpZ~m(4C9WQ>)}}k{qzY3oEK!@@F@H$=?)Ekt5n;!uS2VKIpLpHSsemM5H8uuZ zKHq@L@2ab5sv8LS>zW#x12ynNrUU&0Tg9s9Q~ZjGz(eShx25X}xxBI}UCzS0guJTA z%lY`O?djtgyM+8o8Sm!f-&f@AT&^j8?&9)Zd}baJhGX}KR8nd<68tZLuGuGSTHlov!icV)^y_l`DsVS3R=E%eb zf6c?&army*{dcgWC7_lOMcCq3f%vs0{fHe-#qVVcwWv>mM_a6U!c5dbLA&@stb;h0 zo%5o+tpNVNl1X@)%s!$H3R=PCLNDyOnEosa`WAXoHwD!cd$E2&9iY#V>_wdxG{o%% z!DEm=e*g^1#QbzYIG_~&Bg8}hPyMIz)u2Lq{Mwd&wsE?iQR8>qxtI;=VXjMzzIkKM}{7hl*iU^Upp`UQTDf=sL(c8$*kUKhmi3x$|h@IlxyvnStRBjSpI H!jk_5-x5jv literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_bshuffle_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..ad1c2180b7a4360ff368fb1e928d8f7131620aa8 GIT binary patch literal 24592 zcmeHPdvKe_aX)|{1yQ6#SuaSU1V{tPv`j&u_?9IKq(GV0%e3o<>?#HU0w73%AOe7} zfP*r$Tb5)&^YczF5zF;=*{ z5_uiO3jZ%9HKYpIsOw?kd6?tI1X_7LWQ#UAs|bv>j1)-f7>#;x3xup=G}%^+zrKUf z0%bkC*WcXF31CuC1Rmy^64ioy8MZuSh7P5ZbmG(!+L%!p~vEa+BsxL4e4u=v`Oy$dQe^4k{HA*i9PWq?` zVJUSo9{Tcd^G@{^ze;a$EEYX^%{$yD8mrSZcHKL)X_RjE9d?U5bZ8p8?j1I0ly3GN z_J})lYZ|-m9d>Ax7Q935B^UkCc*HN7^)L$-;S%Uv*Rvq|nYT>+zB2U>m7sqi5K0vW zpuYqI@&MddrvA|~^^cXH|57rQ7!HQKZs7$3B^Z$hAy}qJio zmtaI5gjAXOC(6`6Rf0Yrgnb21c*`ikiaZK$ohjGAxpED_+FPnO>??S~Tjxu#QZx{s zE7!m?aTOZ`l_5%C>OmMl7ha_5qLUUXG^{zGP_BACHVBLUZ%i&)UAx`8%pnTX6oq z>l#XojmM$+q0vw* z_`*?t%J=A4;y5=>Wf0);aPqy~zK4&r`r!yO(b5qJH?<9gMn`?&c=Jvqe5shPv+YD% zi!YFjzyW161m~L=9esn*kU!S@Jy$$2oc5=HJ@1amE3QQ7<8VG&)bM9vX)&3;?n(~D z6TZQ*@mMPU6<6})XkaY*-rjO<#dSOgCtn4tm(1i_u1EdpP&hFL1Bv6|5huIpP$D@z z7Mm8w&ipiZK467gD&Uk@oU`*PU4YBsWZc8g={=l}^~Hrp^j6})lju6+ojN&Q``%Z7 z^KaoX5lsrz+V970O=$xGUuKL+5?yaGyn}iQfbb406c*GNSms1 z4n%WR&W%`6#fHODl|z!$nsev~2A~6nZ86jX^eh8ANLN?o`T-v&X2;`Ziz9EbY|X=1 z<;`Y>git(7svKD;o5|;AD#)MBR8+brPUoJ;UMABsm&ucGJr37dxL%yelh4kem*vS* z9RGylr#OCv;`01*ItI1*T8fMFiz!y-D=BWtZ=neFK84@3Ks}03k0R8g2=yqg%i0b5 zS^HCK>|N7ctGkS@OxKgGMpf2K&Sc+NX3BLzck|@SGvlJV&axW~!^5mL<*z-WQJPRu~(D zJU4?ru-9Idtsql{G5&2nlQqmICJggqaGio{Vgkpr7IetviP@Un76Y-C<5G^xIM#7o z4p;+YKrW}bPMYhaXPHiVmg%HtnNE5Zb?WIHo&`Vr#*6{;gZ;@^ts{w6*YXH{*+5p!9t_55VxDK!$a6RA#zc}U7&Rc8htngeTovH3}+Q4@mH4qo?96g6BC;UY^0Ad4_nLd6;W>1j#07%K?2uUq5f7{$kiSfbSw7?g8bS z09z)PWcPzltUv2`s-?y{VYECqX}Qx1&zh0VTdmbs^QZ5&EVE8j!kpNxX83;j?Aw+l zW@1@ljaZ=UpgfqbVL}|$OGhTHHIbI3pZ`S5(&{MMu+&<~q9seoq_vvzO~?bf0&r2K zX9Cdy&s3|!KE2-l+GNY*bCa!hlI?J;pK{3jgv`2ZB{*w4dQ^Kql&1F}7nRoPB>9$F2OZ3oQIF@JFe=kT2M zd<}hm&!2~w0%Hr$r31=_`E&3reSU(}-8b8JV6hQj_Cz*s#xc$34=jc`HdjyJnsQ)C z^;S#87bebEKjReggOdCpm46ZB&pHoP^i3SDI52Uf;?Tt16-OoxRrCXWFVOb^{Q%HM zfqoe1V?aLw^Z?KSpo2h%fsOz@0(2DU7|?N`6F{ecPEXk2`8CXaaNh^uIt14dxccF` z7q0u@dH}AYa6JsyF}NOqYXGhQTtT?Pa7Ex6fh!7E46Zm_3Aj>lr6&y6DTCDAGrNMA 
zz?Y$Hf`=aHm4c?bp?hXm2^yXQl&=;v_%+gN1dT5{0BsXAz8nDbT0t*j^g2P~#scK) z1-+Qj^@7It8$iB6&`TJ-5$HyX#V{%>GgXae4N`Y()-jNwxG9)1wp*=+v3PN&di&#s zu|w)3mgPPoE%y;=xsOQ8eMDOBBhqpok(T?2wA@Fe(h_u{Cq!;Mpdh!CSJ71a+@qexT zJCmpGej*FItZb{z{wX`(ldP=mrh5`XuKtG|T*AZ9Ld=x5dKlIa3#IEk3|EpWO4~dP z>xq@p4i7^osit&;hhd{FW69oCaV9%K*gZJOfBx%$X}Vs1ZjcwKoUT`&AHz*;xpNF# zuwHO6uP4n1^24>9FU^awlj?Wm818PswwJ%a+L7i9`GHlOFU_0LLiIg4hF+`}T+8c8 z^NIZM2F{n})o7#oeL04Q8nOLNf6dyL<{SBeEu1gSyU{`Q`*RHM!+OEncs6#;>TILYI7OfRcm|eXxR2I1>XF`*$`s-Q@`IgB zF9*JYdfScdQ^*hRd81y$iWaor{dPTKpqF9rE{5U5Z`XIA{GK0C4BXE!_z=VJ$9`1r zA)CIlX|p9;RaNXS5kL3~KG*#D#Q7cxP1omzT_`ti0a!s4C`@T zBa<+%*uLT~BU79&`3o7Q`VJ36C+bH(&Fd-t68jA2Oa4NRQ~d@H!$zFn$kT7={RP$v zl#l!=_ZL_*#Cn4K=$|t^)n8(NS?n*7FERfZxyUg3Rfe&bnZHE7&HQ8JRff^;GK_tH z&R-s@EcTZ;#uto{lD{M{z93Kb7Z&r}JoeG}lEV0c^%Q?eV|+or>@O@vcJNq9<4YXl z3)WNoC4un;`Le&TnCjuNmByD8#uu!o_)8k&3-V=uVKKOm$6^{^;uv4Bp5iYFj4#NS z{e{KsejdANd`V$^!Fr0nq%poAU-lOkzV|+orry zuSYb#L@~Z#J;h&Q7+;Vt`3ql<`uKW8<4XkN3)WNoWd!33@+E)a>rp>nk7#^}Vtm1R zioe7#z93)n7rq`H#Fgi7yczUq*O*iSqao#Fui7#;;UlKgNq3I zONz&rv=mGn~V1_ zYL2AKg>*R+`hhM-Qgb1!r^}ge9(6gAE*H|}OsKERk*;bPoJU=bq|1eLITPyZ zawIhu!g{)#3Fq}%Ig;!za;}7Wx*SRN7tEE=Uv5T@)TH=}nj`UZprTwzmouRs=yD`} zZdAyHu%0ew!gPS@OsKERk@&e(As51Wx||8;QI{j}bF89VNS8CAzAi`N z=U#$hi{g>2f65Uoclff4Lbs(nV8od{J{GT`r`{na~e(Ig*+S zVLe^Wg!8D&k#xC`E@wi0U5=#YLRe3iGvPeyawJ_Yq|2F5Uza1Pxe(UVoXX-m0R!+|M7zB@``@N1_@1|VLcG(1?|b|s)W`3$ z^VY69FVw^L67c)}Gx**9-mVC~gHQ*#x4o+?lIiMlW@ue6t&8t0SbDt?8@^BD^+qyY zuaofauLzlW&kht>4%h&=8L$zs84zEB@&dL1-VfLT?|;END)`+SA<=g?6XNtR++r}? 
znqzn?Sx)5*9)`CW47cSN-cB}Cd83Enc7vfS$8ZN}rgFE3VUxjdXO3Yr@ltt#j+R11L-Xay;Y%aRpVy?_SOMzt%T&&F=jciDND8IF4LSDb6A z%y8s%#k%U31o>MIGX56e!;ur`|NKhj;wPzm>tQC}s_5N%H{;(b>FptNdJXq5d4r;N z+aEFhZIWJ>q_^#UCf}y$-Tols-!AFxmGl}PV)8~sZ~GrJ{&q>vBk8$5#^f$VZ-FtyBT85dtMbT?L&iJj8o>$Uq z8)fn~MX!B~@!KW6UP-Ux<4oS6=yfI;zf;oNFX`9bmBz}h9K*dlc6)g2?l#s^c~6ev zJ|4ThJa+dQ+o*hhj^UjjIJTY>v9<4g%WmmeH-O%w&|M0>SD`%$-L23)3cXLEy$ao{ z(EAnoPKoBR>EGepQ^clsD)3#c;x&8)a+P%EXYBV^LrFJw;(nY0fOvd4u?+7k%bCxuMf z9|+kgA(QqBLN+dB(!N2+VnQbEA%tv5$fW&*kPQf#w6_qlhlEVpX9(H7LMH7wgzTV@ zN&62W^9q@?7ZI{OLMH7?gse@-q&Jt7S23He0kmchuQmFFQ$C7C(2*>XirqU7yf8Z zL~ij^#ysWbc`!!+nO`T0HU&!Y7bT(nnCv5(W&L)>HWE0nTlogF9>>Gv6FY0V^ z`9d~vjh|-|_MXD#r*$^Dd?A~-#?P|}`&VHzrL)Q93)#dqex6O(0}GozolP!Z$R@7w z^K8OCTG%|Gv&rQP*~B$|o=w<`3!AMvn_Rw-O=4m9{%2=@V8!dr)F!M7I^38)NE}7!=;T3m$@0%wJ==XVKH6- z-4A=MCUq(Lw`JJwhk1^H?bdc#3~cwqJkP*(mwH&cALh9Rw!5r{wfkY7Z(zH1Ue@l1 zdd>muF5eHnP0y?+;arx`*S8R@HvFH^*QO9TNnZm)`g5FtuUp|~f&uxTtNAf3_H(&~ z^HahArv)4uj=`^q?sC9yj5-b-@;Y|9UGUG*up>Ga8ww?Saepd;e~j--j`>m%_+$M< zd^G7B^v8V3@pybJk%Ah7$NfXYu_2I$e8Es)e8@K(3y=B2V+o%>8ug7OeeuLtcsLqL z?novEcSNJ<(e1z)2@R$w;cy3o{$_tL+}hUO))sR6{e$kHySb^oc`y`gZg1@jHNg*% z4h9Bq6L&oy@%20cFS(t%y;xSm`Hgt?FMZa~@*uS-tnl}7zFV<#g!3)*53vj8$wQp~3xyx#{Qjo7x>%x<4EI@w-(g6f$7p#GdA3_( zLUD@oPbl@zbAFE9SV8G8IDa-!V9~;hod4_t#r$t`KDoD;|2pR%>o4a2Gw1gy{I@y( zXQATy?{ofZ{C9$d^rxJELE-<3^MC)asLAju=XV?}<}aoWw1_wr{z}gOv0?}Q+#5dN z504bv*~a;=Dg0K>e^VJx59ikp7Tb9f`DQB-zlSWO2Z68q-G_6)*ZuATamB_`A>s<2 zjKNP?5Z6#_+!gUBBZU8t{8WOt5@SJs%1>OW(1{eS?H?T;1a>Hz7=uSV7=#C$xCX~Y z@y9<1{wzky9{`MiHY+hCQGXy5^|{@x`wPX6o?jmld)o`#&R$XMY%Vn1+gxb4XJ?^qx4TfGv$MeM?A$3V^zJY8qPw?X zx4XA+zx$dCJ?`l!ROsoDDin&|=0cCXI}3Gto99|?77kflkgBa-q-J@ORyWuDwDo)>DBFMPJ;%Ks(6s6>y<1)^CAg{pMz9A4D1PR=BWV zLf<2P-$fu+l0F`(jKv*rp?(z-_}Rul{9b$EciA~xy@6KXqb~LVKO!!P-%*bt9h66q5WO3gF>C literal 0 HcmV?d00001 diff --git a/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co b/hsa/gfx950/bf16gemm/bf16gemm_fp32bf16_tn_96x64_splitk_clean.co new file mode 100755 index 
0000000000000000000000000000000000000000..c435c53f18002088249b75743e54dcc2c2350cb6 GIT binary patch literal 26672 zcmeHQdvH@%dOwn7TQ&wvco{J0LJVP3BjmSDz{n2>0YjWD*(F&LS+-=$Aj^&{97tJn z?O=zcX-L@ZcDJvPgph1DuT7KPY?E5rwB7C>&;gp6CX-~w-O0}EAKMw4b~`(p7WMni z!V{%p+c$Y9zASq=u+QB6dvXs%})^x0FXS6`s z4zJbo0F!%0({YHAYdw?FS5 z?Cl*6#>jK-;r@q%&L6r*{NV#Z-~Rqc;AN)j>psxi8;njclV^wg0b%6S7`@Pa$VXiW zPq9No!Dstxs?|&U0lmb*Nchkdudq!FR;n57s#jR1G5Vsfuti*Ul@)>`vbvtkBEYCD= zW#L4ggjkOD2XnMPoP|A~gf=S@&YD?xk!RuT(OexI%hds_z1c=Xn-vXbpUA>X+C+RO zR|ntA)xlq6;eba(ODZBxW#J`lB2MS(;DuZryp)9lF%hYVcr6PrX%q2At`6SH)xkSi zIN%e}nu>@-7GBaO;_q{H@QYj>oXx_4n21zFoX^5b+C;pUtAqD*b?{LZ4o>e3^bd$- z^XzmM9?~Y^lUyDAI#&n3$-=?uUGgMc%ECk1B>ZQt4t|%bga6LL!RZ~+B%HI6EL_O5 zaL(q)*@H7@4<%W6I5RvH?vL#kABuCf(kz^$PsM_qJ(T6_p*#x@FZK`n1`ou1-BtCe z=`gZzls+95IeS=}vxmxS@sM3t<#cPme>gR79ya2&HGj_88Z~BD+t$AT%_v`Y+UVYE zxnE5e>YS}TXAj$Q_Rw)H9@Ka3bGGYV9*RYM!@&XnP-rk3Ouc`dZ0u`&|MqCiGQNL5 zb@xRF4-CQZeFMQr%-7q0Fc|RR7U-X_G5urOl!R`irQi?u_eC6U&fufTIa}9T?g9Tn zUvJnS^F1^e-Ors<86z|IwtFbrKjM!Ct?Ni#D0#yj4L%5M(X{ct4J(Sx_Cxn@-%!-oGk72p z8+zV7d}yG1F#JJ%7FXcjAAmlWG6~<| z6xb8fa0e#fCBOtAed7x4iBmv(0TX~Gpb1Fw?Flb{7qAm)TS3A=v=<~ihr12(!~XdPQc^DZXCBejHJV{CJA$$wA&dHLUEE57;!ka zlkZODkw2Zv%lC{PNjw%mPfkvqCy&E%9F9pi&P*lAx2G`jlH>%(KjQc($LA@|OU|Qn zFgrP$Vo|b)Vtz88;_BpTilFx>e$xVa6hV(7=urecic8}z(>3WjvDnpovUyRn**xC- zc%4}gx09ps_vYFX&2YO(^6Zp7ub?n#ziAYxWYWF>@kr9XXf%0*kQGZ@CtYw)^<%|x z1MEu3{0qcogKym?E|zQ}W<@fw?isQ&SiNn_UI*JThU; z@$Zx4anpWm)U+Rm<1siMAI15c4L0Qb=yXZD!$h3TaSq4197{RQ11y0#Am>N8%?P&{ znPxU4)68aMn%Rs@qfI$|4<{iGUz#%E{lWVgcRDNPt;aoRRM^;Rs*81HGt^r7QouES@F9d2A545 zCu&Qaqh{?>V~*RM@Yy!wNvE^;a`MQXj=9d!e7Faf(+=NfPhND)vJ=NFXUGBP2Iay1 z^%G(g&)GNTED6=l`R+Gs=M;z0hhw&r6waDM#+=2JZ$lo?d4Pra-cdvYKEri}>*O-m zTVu6jPmR^NNW9TlHeujrvd+ccf4Fb@{sYqYALjw?0ppCm|A6%U2c+*mAbtM<>H7~z z-+w^*{sYqYA5e@#=Mrc-mwwReL#nSjsQIb zbQI_q&?7(}1o~m14+A|4v9caUN|0rqYI91I0A6=!V!XF z9~@ygB5(}B5rrcL#|Rt;;dmI1!*GnkVZt#E$5A*YM@{!(la$^yJ)hX%^9$!TNOV9i z5H!3eNG}vLeD;xEBxnbt7YiC6@Bn$4pz(nSpqB``kkLy8jl~$qmkGLv(dB|JX7qAF 
z&tminper2?)6CBwFQ}X}%_?>u2CeZQqcT9RtcKl$IXJ~_fa8eejlp^&F^E4 zp!t2=0`xkk(=>~U#*5b-H_i3ZeB!x0pGeE|iL^YQNXzqyv^<|k%kzn}JfBF*^NF-P zpGeE|iFBsFZ`9xZtPo(AfL^{)adO!x(LE{t8<@rQf zo=>FZ`9xZtPo(AfL|UFtq~-ZUTAoj&<@rQfo=>DR&1VhB#C+npJfBF*^NF-PpGeE| ziL^YQNXzqybcXp{MxKGS_~|K8Q!H`)WbELH$Kp`A$FT;hD|0>S;SV4*@z1GXH(tPsrhieJxN-zwR zjXUCbj-&CImt9vS`PYB$vb|m|=PKlN=f7UA<}A}geR~oNYf&#y#Py_DL4NOS&X;1w ztfuyj35FXzIQG1E**H=xA-{Ve=SwkV)>3nX7s z8Q^>=R!I-FcX}C?U`&M`e3y-*#Ol5SoG-;H2~zt~FT(|BA3ntOlvs^?lk=rmC4JPs z%*(JGV=6Ruk&Um!>b?oimtvLlQ+vb9umbJF-{N{otVaHr^QBlN`>Fj}FT+ZVpU{)P zoEfXQzVO_OI{W?vbdV?IU=ge4MV^Zo22elzeXg&>YUD3DUy0Sw)1NZ`O04dCj`NjR z4gY}il~|3u%=t>JhJK`v)qQWK$7=YeELJ0bn~K%f3esbh=2cv`q*$eS6?t;3vfS_C zYd|WmqMj0~G_N9Gj#ai+H1ahgl~++uiB+0ckuS$8TU)$*jY;KI)Kg-W=2hg&vC7t> zZG25iM5~G^D6S?SY>P79=_&f&a1d~OR-AxD)Mx( z%GS#eUpG^E74?-^rFj+kO03enihLzjX4rtH_gLmDd1-*8r)!ih4?{(!7d%DOPz6;N&$xDzBoR600%6JWs%B!fS#463J$d_W3*8m2u0aAGt^^{npc@_CmtnwOQEw2G8 zGv-xX@1D75Pf6(!7d%C01!(MZOZNG_N9GiB+0ck*~yR zE_wC7d`(_eY5{ z8X%QdQBR3gnpcr8#VW4>w(%Mul~++uiB+0ckuSw6uK_xF4Uo#KsHem#&8x_lVwKkb zdw30yIj<@;0L`mct^q;`Rs)1F{%O66`bwEz7nf6uOeTGRhn0kuf!_N ztH@VkHJ7~lsQ&TJVwL7q@%%;74?)@rFj+ka;z%*QmbA?JtbCYUPZnftI9sss#j4@iB+0ckuS%pvhU59S8*Lx z^D6Q(*8tYu*{WAjUx`(kSCOy8D$T3NS7MdsRpcwNO7kl6l~|>D75Pf6=8{((5Uc<5 z+4vQE0lGcEt~Y@3s_O-)dw$f@^#(BBbiDxGo?q7+Kzm&;K;84Bo~}25@uuqq==S`& z-T>O`dI9R5AN6#-0gSDy^#a5`O1J0N^$4uoitDiwuQyUJK<)uxPasn-U{79p9ia9C zbbEeXZ@|i}xZX**Ro(NWo~}25F{JAS==S`&-T>O`dI9R5AN6#-0gNGCFF?2F*YyU_ zUe^my_xz}*>kVYgt+J`K`Q)dQx40F+}q!@})X}_m$K3{JP!%+S9y>_EH_d`_R^& zAN6#-0gNGEt7tFP0ld#`?fFqp*Bi)~S8**>V^y=~*Yyaj zyox@QSiO;Y0dlNfDX-d!+Doyj?)j~}ih9z13uB1pRpd*ts@wDHdIM-r^D5d)v8wL* zQBT(!$e33%*8pjIeqE2i%Bz|h;6~~N$gz5*y!sM6KO=w7s`dhOdw!Z%ajlbj16E!| zzLZzh?~1Lwih4?{(!7d%IaW1$eqC?C%ByHE$EvdDxAH3LDX~iPD)Qx6)$I9oy#b6t znpe?Yj#XvPk9xY^K*qe9xduqv^V7VFx>Aq8%Bz|h;6~~N$gz5*ylR73#a;kDqgRkW z5l_N%qh0V^LW8R~{_nO4cvh--R6Gle&x!s6=;JeCNoR9uJLuu_jQHH%QG7;kb2EL; ztQ6+U)!ZB!Z*Hy_r@Ea~7oT->+y&2~;d8j~Oxk#7X9Xebc?3eHK5$in!+C)Cu=Fax 
zTEH5>M!&ZMS_jnmrnGCBF3~R_L zDzEi2tTP$bCm1%68Y*w}GHfy#Zb&e^jkHtwMlZunCd1|g!_DL#D))LBwwMfC6Aat% zKFZ+PkOG)jd}eonlRXQKe8c68f0l&ja^XH#z?|Kb2hY}-wI4ewOX1^Od%>|8<^$<< z3hh?t^$P7#=qiP-R_Gdqu2twdg|1iV28C`^=q82UpwPD|^hSl=q|nU@y;-5X3f-d6 ztqR>H(ePg2`*j(5XbWb0-ff1r;4{ZK{Y*Rqp9%4K7?(Z|$3RDX9`N~Qcuv)Lrns{d za7{jZ9`G4xcy7Fuem3xVROPd=k!(9(d9NK(X!1hk;sQIQ@(a#|pAzJ+`v&8$13t7A zDZick+lBn1$EnG^f7*o zWOut{SKH6zwTfNce#Wnp?6ygE^#e>^uh=yVGJb<(cZX!x_#l%vDt1l7jNc^LZI|pe z9ANSdirsAwG5&3m-44la;~^&BsMu}#L&o1E*>y;E&EI75X2ovv5ysyv+3l3F?vie1~IjNc~N-O23;`Lp+3uo#$5z^xGM^8mL2y##PO(5nDD zfW8%Q*NoiV&U5z`(@AsX)&#@bdG6lEbN3zQ5-Q)GV7PK>F~ z9g4M29QuYLzr17rzo~t$mQ%j0iINw-tclWV;V)|<@`cw#xbFNO^1;u@-@w0dczbGt z?eY}Cx(MndkuN?4xe4;0Y2;`p_+rnBK7Xb23AL8+`LWI?muK>MUgs0)HR1CYI-gvg z$>-ZTpHTA&pMS0M$>o`R;u?>#wE2X(QTY5Tolh>$n9lgl&t#5I0~PpE-~PoK^wmuK>cYy1qKP)7@&+jTyno;S=@*!sjfVPcGN_6l?s?!A`94e+U0&()u^z zrH7|WDjo2>%i-zS9)@#j8P08FSh|tnJg>ui18lGNT1`r0^xvf8xYx6ff#c5J>M(KK z>)Fr1ap!Df<6h6c297&-I~(_U_BU|c(hfH6b?tM&xbt>FY}20g81!Wc{aOU^(qjBe z=%a~j4v!411J1r+PmB_V zClK)0_yfIl^$qp)L66_x;|X|bsv2r~f`OWbx~5%q z!};$$kk0=R=QrM$&VPsVzq2Qu|Bsw+D*TI_UlB>y|CsYH-Jj0?HRpe%@c)DJYkSi5 zFLVA-S319l22df{tMC_ae!1cYekKN9g~ZX9?&ns{zto$~uj728%%_+0cPsoKBj4^M z;{T%{yzKzK?sue)0AKezQp6n@j0K51a3})5g+<(bkpu3Ke>g<=U*yN4#2p-lD;pd9^)6rgy}Md=-ocWV2(_#b!xq1?$HxokOPp{N|9=c2 z7VDFs(RLOb!c44#0^w@_`oeV($8s{#S|8^E{a?u>d|Sg_VjUD%!TCZi>^U2LhlQ=N zLN3-#frer)u3w-F Date: Sun, 21 Dec 2025 17:56:12 +0800 Subject: [PATCH 25/40] fix tuner (#1701) * fix tuner * Update gradlib/gradlib/GemmTuner.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: amd-ruitang3 <145657428+amd-ruitang3@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- csrc/py_itfs_cu/asm_gemm_a16w16.cu | 2 +- gradlib/gradlib/GemmTuner.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/csrc/py_itfs_cu/asm_gemm_a16w16.cu b/csrc/py_itfs_cu/asm_gemm_a16w16.cu index 4d6b723e4a..051fde35fb 100644 
--- a/csrc/py_itfs_cu/asm_gemm_a16w16.cu +++ b/csrc/py_itfs_cu/asm_gemm_a16w16.cu @@ -254,7 +254,7 @@ torch::Tensor gemm_a16w16_asm(torch::Tensor& A, int gdy = (Mdim + SUBM - 1) / SUBM; int gdz = selectedksplit; - TORCH_CHECK(gdx <= 16, __func__, " gdx (", gdx, ") must be <= 16"); // 16 = 512/32 + TORCH_CHECK(gdy <= 16, __func__, " gdy (", gdy, ") must be <= 16"); // 16 = 512/32 // semaphore.fill_(selectedksplit); args.ptr_semaphore = (void*)semaphore.data_ptr(); diff --git a/gradlib/gradlib/GemmTuner.py b/gradlib/gradlib/GemmTuner.py index 8b83ca742b..2809aab50c 100644 --- a/gradlib/gradlib/GemmTuner.py +++ b/gradlib/gradlib/GemmTuner.py @@ -24,13 +24,13 @@ import torch.nn.functional as F import aiter -from aiter import dtypes, logger +from aiter import dtypes, get_semaphore_workspace, logger from aiter.jit.core import AITER_CONFIG_GEMM_BF16, get_asm_dir from aiter.jit.utils.chip_info import get_cu_num, get_gfx from aiter.ops.shuffle import shuffle_weight +from aiter.ops.triton.gemm_a16w16 import gemm_a16w16 as triton_gemm_a16w16 from aiter.utility.base_tuner import GemmCommonTuner from aiter.utility.mp_tuner import mp_tuner -from aiter.ops.triton.gemm_a16w16 import gemm_a16w16 as triton_gemm_a16w16 aiter.hipb_create_extension() @@ -59,10 +59,12 @@ def call_hipb_mm( def run_gemm_bf16_asm( inp, w, out, bias=None, splitK=None, kernelName=None, bpreshuffle=False ): + sema = get_semaphore_workspace(inp.device) return aiter.gemm_a16w16_asm( inp, w, out, + sema, bias=bias, splitK=splitK, kernelName=kernelName, From ac6142e82445329c1ecbcb161444bbc27ebc572e Mon Sep 17 00:00:00 2001 From: mqhc2020 Date: Sun, 21 Dec 2025 22:35:50 +0800 Subject: [PATCH 26/40] add gen_fake for 4 gemm operators (#1456) Co-authored-by: Lin, Soga Co-authored-by: sogalin <39478626+sogalin@users.noreply.github.com> --- aiter/ops/triton/batched_gemm_a16wfp4.py | 35 +++++++-- .../triton/batched_gemm_afp4wfp4_pre_quant.py | 7 +- aiter/ops/triton/gemm_a16w16_atomic.py | 72 +++++++++++++------ 
aiter/ops/triton/gemm_a16wfp4.py | 35 ++++++--- aiter/ops/triton/gemm_afp4wfp4.py | 54 +++++++++++--- .../triton/gemm_afp4wfp4_pre_quant_atomic.py | 13 ++-- aiter/ops/triton/utils/common_utils.py | 9 +++ 7 files changed, 170 insertions(+), 55 deletions(-) diff --git a/aiter/ops/triton/batched_gemm_a16wfp4.py b/aiter/ops/triton/batched_gemm_a16wfp4.py index a10cc66bea..ffd8b0ba3d 100755 --- a/aiter/ops/triton/batched_gemm_a16wfp4.py +++ b/aiter/ops/triton/batched_gemm_a16wfp4.py @@ -11,9 +11,11 @@ _get_config, ) from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import deserialize_str from aiter.ops.triton.gemm_a16wfp4 import ( get_splitk, ) +from aiter.jit.utils.torch_guard import torch_compile_guard _LOGGER = AiterTritonLogger() @@ -26,17 +28,36 @@ def set_use_gemm_splitk_bf16(value: bool): _USE_GEMM_SPLITK_BF16 = value +def batched_gemm_a16wfp4_fake_tensor( + x: torch.Tensor, + w: torch.Tensor, + w_scales: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[str] = None, + transpose_bm: Optional[bool] = False, + prequant: Optional[bool] = True, + y_scale: Optional[torch.Tensor] = None, +) -> torch.Tensor: + if y is None: + Bx, M, _ = x.shape + _, N, _ = w.shape + return torch.empty((Bx, M, N), dtype=dtype, device=x.device) + return y + + +@torch_compile_guard(gen_fake=batched_gemm_a16wfp4_fake_tensor) def batched_gemm_a16wfp4( - x, - w, - w_scales, - dtype: Optional[float] = torch.bfloat16, + x: torch.Tensor, + w: torch.Tensor, + w_scales: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, + config: Optional[str] = None, transpose_bm: Optional[bool] = False, prequant: Optional[bool] = True, y_scale: Optional[torch.Tensor] = None, -): +) -> torch.Tensor: """ Computes batched FP4 matrix multiplication Y[i] = X[i] @ W[i]^T with active activation quantization. 
X is quantized to MXFP4 during computation, W is pre-quantized FP4. @@ -72,6 +93,8 @@ def batched_gemm_a16wfp4( if config is None: config = _get_config(M, N, K) + else: + config = deserialize_str(config) if y is None: if transpose_bm: diff --git a/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py b/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py index 01add76bdf..92a8b30256 100755 --- a/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py +++ b/aiter/ops/triton/batched_gemm_afp4wfp4_pre_quant.py @@ -3,9 +3,8 @@ from typing import Optional import torch -import triton -import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import serialize_dict from aiter.ops.triton.batched_gemm_a16wfp4 import ( batched_gemm_a16wfp4, ) @@ -32,6 +31,8 @@ def batched_gemm_afp4wfp4_pre_quant( _LOGGER.info( "batched_gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to batched_gemm_a16wfp4" ) + + config_hashable = serialize_dict(config) if config else None return batched_gemm_a16wfp4( - x, w, w_scales, dtype, y, config, transpose_bm=False, prequant=True + x, w, w_scales, dtype, y, config_hashable, transpose_bm=False, prequant=True ) diff --git a/aiter/ops/triton/gemm_a16w16_atomic.py b/aiter/ops/triton/gemm_a16w16_atomic.py index 78026c80f0..38341b3efb 100644 --- a/aiter/ops/triton/gemm_a16w16_atomic.py +++ b/aiter/ops/triton/gemm_a16w16_atomic.py @@ -11,34 +11,34 @@ _get_config, ) from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import serialize_dict, deserialize_str +from aiter.jit.utils.torch_guard import torch_compile_guard _LOGGER = AiterTritonLogger() -def gemm_a16w16_atomic( - x, - w, - dtype: Optional[float] = torch.bfloat16, +def gemm_a16w16_atomic_fake_tensor( + x: torch.Tensor, + w: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, y: Optional[torch.Tensor] = None, - 
config: Optional[dict] = None, -): - """ - Computes 16 bit matrix multiplication Y = X @ W^T using atomic operations for split-K reduction. - - Args: - x (torch.Tensor): Input matrix with shape (M, K). - w (torch.Tensor): Weight matrix with shape (N, K), internally transposed. - dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). - Note: BF16 atomic aggregation may have slight precision loss. - y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). - Must be zero-initialized for split-K (NUM_KSPLIT > 1). - config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, - BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, cache_modifier). + config: Optional[str] = None, +) -> torch.Tensor: + if y is None: + M, _ = x.shape + _, N = w.shape + return torch.zeros((M, N), dtype=dtype, device=x.device) + return y - Returns: - torch.Tensor: Output with shape (M, N). - """ +@torch_compile_guard(gen_fake=gemm_a16w16_atomic_fake_tensor) +def gemm_a16w16_atomic_( + x: torch.Tensor, + w: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[str] = None, +) -> torch.Tensor: _LOGGER.info( f"GEMM_A16W16_ATOMIC: x.shape={tuple(x.shape)}, w.shape={tuple(w.shape)} " ) @@ -50,6 +50,9 @@ def gemm_a16w16_atomic( if config is None: config = _get_config(M, N, K) + else: + config = deserialize_str(config) + # For compatability reasons, these keys may not exist in the config # TODO: This needs to be embedded in the configs later if "NUM_KSPLIT" not in config: @@ -89,3 +92,30 @@ def gemm_a16w16_atomic( ) return y + + +def gemm_a16w16_atomic( + x: torch.Tensor, + w: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +) -> torch.Tensor: + """ + Computes 16 bit matrix multiplication Y = X @ W^T using atomic operations for split-K reduction. + + Args: + x (torch.Tensor): Input matrix with shape (M, K). 
+ w (torch.Tensor): Weight matrix with shape (N, K), internally transposed. + dtype (Optional[torch.dtype]): Output datatype (BF16 or FP16). + Note: BF16 atomic aggregation may have slight precision loss. + y (Optional[torch.Tensor]): Pre-allocated output tensor with shape (M, N). + Must be zero-initialized for split-K (NUM_KSPLIT > 1). + config (Optional[dict]): Kernel tuning parameters (BLOCK_SIZE_M, BLOCK_SIZE_N, + BLOCK_SIZE_K, GROUP_SIZE_M, NUM_KSPLIT, cache_modifier). + + Returns: + torch.Tensor: Output with shape (M, N). + """ + config_hashable = serialize_dict(config) if config else None + return gemm_a16w16_atomic_(x, w, dtype, y, config_hashable) diff --git a/aiter/ops/triton/gemm_a16wfp4.py b/aiter/ops/triton/gemm_a16wfp4.py index 40744fba68..2bc0983119 100644 --- a/aiter/ops/triton/gemm_a16wfp4.py +++ b/aiter/ops/triton/gemm_a16wfp4.py @@ -4,10 +4,9 @@ from typing import Optional import torch import triton -import triton.language as tl import aiter.ops.triton.utils._triton.arch_info as arch_info -from aiter.ops.triton.quant import _mxfp4_quant_op from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import deserialize_str from aiter.ops.triton._triton_kernels.gemm_a16wfp4 import ( _gemm_a16wfp4_kernel, _get_config, @@ -18,20 +17,38 @@ from aiter.ops.triton.gemm_afp4wfp4 import ( get_splitk, ) +from aiter.jit.utils.torch_guard import torch_compile_guard _LOGGER = AiterTritonLogger() +def gemm_a16wfp4_fake_tensor( + x: torch.Tensor, + w: torch.Tensor, + w_scales: torch.Tensor, + atomic_add: bool = False, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[str] = None, +) -> torch.Tensor: + if y is None: + M, _ = x.shape + N, _ = w.shape + return torch.zeros((M, N), dtype=dtype, device=x.device) + return y + + +@torch_compile_guard(gen_fake=gemm_a16wfp4_fake_tensor) def gemm_a16wfp4( - x, - w, - w_scales, + x: torch.Tensor, + w: torch.Tensor, + w_scales: 
torch.Tensor, atomic_add: bool = False, - dtype: Optional[float] = torch.bfloat16, + dtype: Optional[torch.dtype] = torch.bfloat16, y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, -): + config: Optional[str] = None, +) -> torch.Tensor: """ Computes the matmul Y = X x W W is an e2m1 fp4 tensor and w_scales is an e8m0 tensor. @@ -62,6 +79,8 @@ def gemm_a16wfp4( if config is None: config = _get_config(M, N, K) + else: + config = deserialize_str(config) if y is None: if atomic_add: diff --git a/aiter/ops/triton/gemm_afp4wfp4.py b/aiter/ops/triton/gemm_afp4wfp4.py index dec0560f3f..1085dd5d12 100644 --- a/aiter/ops/triton/gemm_afp4wfp4.py +++ b/aiter/ops/triton/gemm_afp4wfp4.py @@ -4,17 +4,17 @@ from typing import Optional import torch import triton -import triton.language as tl import aiter.ops.triton.utils._triton.arch_info as arch_info from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import serialize_dict, deserialize_str from aiter.ops.triton._triton_kernels.gemm_afp4wfp4 import ( _gemm_afp4wfp4_kernel, - _gemm_afp4wfp4_kernel_preshuffle_scales, _gemm_afp4wfp4_preshuffle_kernel, _gemm_afp4wfp4_reduce_kernel, _get_config, ) from .utils.core import AITER_TRITON_CONFIGS_PATH +from aiter.jit.utils.torch_guard import torch_compile_guard import os from aiter.utility.triton.triton_metadata_redirect import AOTMetadataContext @@ -63,16 +63,34 @@ def get_splitk(K: int, BLOCK_SIZE_K: int, NUM_KSPLIT: int): return SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT -def gemm_afp4wfp4( - x, - w, - x_scales, - w_scales, - dtype: Optional[float] = torch.bfloat16, +def gemm_afp4wfp4_fake_tensor( + x: torch.Tensor, + w: torch.Tensor, + x_scales: torch.Tensor, + w_scales: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, y: Optional[torch.Tensor] = None, - config: Optional[dict] = None, + config: Optional[str] = None, skip_reduce: Optional[bool] = False, -): +) -> torch.Tensor: + if y is None: + M, _ = 
x.shape + N, _ = w.shape + return torch.empty((M, N), dtype=dtype, device=x.device) + return y + + +@torch_compile_guard(gen_fake=gemm_afp4wfp4_fake_tensor) +def gemm_afp4wfp4_( + x: torch.Tensor, + w: torch.Tensor, + x_scales: torch.Tensor, + w_scales: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[str] = None, + skip_reduce: Optional[bool] = False, +) -> torch.Tensor: """ Computes matrix multiplication Y = X @ W^T with FP4 activations and FP4 weights. @@ -91,7 +109,6 @@ def gemm_afp4wfp4( Returns: torch.Tensor: Output with shape (M, N). """ - _LOGGER.info( f"GEMM_AFPWFP4: x.shape={tuple(x.shape)} w.shape={tuple(w.shape)} x_scale={tuple(x_scales.shape)} w_scale={tuple(w_scales.shape)} " ) @@ -106,6 +123,8 @@ def gemm_afp4wfp4( if config is None: config = _get_config(M, N, K) + else: + config = deserialize_str(config) if config["NUM_KSPLIT"] > 1: SPLITK_BLOCK_SIZE, BLOCK_SIZE_K, NUM_KSPLIT = get_splitk( @@ -527,3 +546,16 @@ def gemm_afp4wfp4_preshuffled_weight_scales( "gemm_afp4wfp4_preshuffled_weight_scales will be deprecated in future AITER release, please switch to gemm_afp4wfp4_preshuffle" ) return gemm_afp4wfp4_preshuffle(x, w, x_scales, w_scales, dtype, y, config, use_aot) + + +def gemm_afp4wfp4( + x: torch.Tensor, + w: torch.Tensor, + x_scales: torch.Tensor, + w_scales: torch.Tensor, + dtype: Optional[torch.dtype] = torch.bfloat16, + y: Optional[torch.Tensor] = None, + config: Optional[dict] = None, +) -> torch.Tensor: + config_hashable = serialize_dict(config) if config else None + return gemm_afp4wfp4_(x, w, x_scales, w_scales, dtype, y, config_hashable) diff --git a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py index d3738fd4aa..2d5cbe3e32 100644 --- a/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py +++ b/aiter/ops/triton/gemm_afp4wfp4_pre_quant_atomic.py @@ -3,9 +3,8 @@ from typing import Optional import torch -import triton 
-import triton.language as tl from aiter.ops.triton.utils.logger import AiterTritonLogger +from aiter.ops.triton.utils.common_utils import serialize_dict from aiter.ops.triton.gemm_a16wfp4 import ( gemm_a16wfp4, ) @@ -14,9 +13,9 @@ def gemm_afp4wfp4_pre_quant( - x, - w, - w_scales, + x: torch.Tensor, + w: torch.Tensor, + w_scales: torch.Tensor, dtype: Optional[float] = torch.bfloat16, y: Optional[torch.Tensor] = None, config: Optional[dict] = None, @@ -24,4 +23,6 @@ def gemm_afp4wfp4_pre_quant( _LOGGER.info( "gemm_afp4wfp4_pre_quant will be deprecated in future AITER release, please switch to gemm_a16wfp4" ) - return gemm_a16wfp4(x, w, w_scales, True, dtype, y, config) + + config_hashable = serialize_dict(config) if config else None + return gemm_a16wfp4(x, w, w_scales, True, dtype, y, config_hashable) diff --git a/aiter/ops/triton/utils/common_utils.py b/aiter/ops/triton/utils/common_utils.py index 2da76efe38..4729ccfdb3 100644 --- a/aiter/ops/triton/utils/common_utils.py +++ b/aiter/ops/triton/utils/common_utils.py @@ -5,6 +5,7 @@ import torch import triton +import json def prev_power_of_2(x: int) -> int: @@ -34,3 +35,11 @@ def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor: if x.stride(-1) == 1: return x return x.contiguous() + + +def serialize_dict(d: dict) -> str: + return json.dumps(d) + + +def deserialize_str(s: str) -> dict: + return json.loads(s) From 420f5daed2f653612708daa3400b77b67b6350e1 Mon Sep 17 00:00:00 2001 From: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> Date: Mon, 22 Dec 2025 00:48:09 +0800 Subject: [PATCH 27/40] fix llvm issue (#1703) * fix llvm issue * fix copilot --- 3rdparty/composable_kernel | 2 +- aiter/jit/core.py | 33 ++++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 9a6e61de97..9bd67c2cf2 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 
9a6e61de9787be2e7ed4a9566cb59a420c5d3f78 +Subproject commit 9bd67c2cf2fe8e4479a433bcd6d467e2ea9aedb4 diff --git a/aiter/jit/core.py b/aiter/jit/core.py index 3b91336d6b..70237ce003 100644 --- a/aiter/jit/core.py +++ b/aiter/jit/core.py @@ -375,12 +375,34 @@ def validate_and_update_archs(): @functools.lru_cache() def hip_flag_checker(flag_hip: str) -> bool: - ret = os.system(f"hipcc {flag_hip} -x hip -E -P /dev/null -o /dev/null") - if ret == 0: - return True - else: - logger.warning(f"{flag_hip} is not supported by hipcc.") + import subprocess + + cmd = ["hipcc", flag_hip, "-x", "hip", "-E", "-P", "/dev/null", "-o", "/dev/null"] + try: + subprocess.check_output(cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + logger.warning(f"Current hipcc not support: {flag_hip}") return False + return True + + +@functools.lru_cache() +def check_LLVM_MAIN_REVISION(): + # for https://github.com/ROCm/ROCm/issues/5646 and https://github.com/ROCm/composable_kernel/pull/3469 + # ck using following logic... 
+ """#if LLVM_MAIN_REVISION < 554785 + #define CK_TILE_HOST_DEVICE_EXTERN __host__ __device__ + #else + #define CK_TILE_HOST_DEVICE_EXTERN""" + import subprocess + + cmd = """echo "#include +__host__ __device__ void func(){std::tuple t = std::tuple(1, 1);}" | hipcc -x hip -P -c -Wno-unused-command-line-argument -""" + try: + subprocess.check_output(cmd, shell=True, text=True, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError: + return 554785 + return 554785 - 1 def check_and_set_ninja_worker(): @@ -541,6 +563,7 @@ def MainFunc(): "-Wno-macro-redefined", "-Wno-missing-template-arg-list-after-template-kw", "-fgpu-flush-denormals-to-zero", + f"-DDLLVM_MAIN_REVISION={check_LLVM_MAIN_REVISION()}", ] # Imitate https://github.com/ROCm/composable_kernel/blob/c8b6b64240e840a7decf76dfaa13c37da5294c4a/CMakeLists.txt#L190-L214 From f174268f3862c9546ce16e83c8c21250257bf621 Mon Sep 17 00:00:00 2001 From: ClementLinCF <162283536+ClementLinCF@users.noreply.github.com> Date: Mon, 22 Dec 2025 11:03:24 +0800 Subject: [PATCH 28/40] feat: Adaptive topk algorithm selection based on input characteristics (#1578) * Add radix-base selection * Remove explicit template * Update the selected k condition * remove pos < k guard * code format * Update csrc/include/rocm_ops.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update csrc/kernels/topk_per_row_kernels.cu Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update csrc/kernels/topk_plain_kernels.cu Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update test_topk_plain.py * Update TODO message * Update csrc/kernels/topk_per_row_kernels.cu Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update op_tests/test_topk_plain.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * format test_topk_plain.py with black * Disable triton test for a resonalbe execution time * add explicit template instantiation * fix explicit 
template instantiation * add explicit template instantiation * Add bf16 support * Fix linter * Fix build errors * Fix condition * Fix build and test * Update conditions --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com> Co-authored-by: MHYang --- aiter/jit/optCompilerConfig.json | 3 +- aiter/ops/topk_plain.py | 7 +- csrc/include/opus/opus.hpp | 2 +- csrc/include/rocm_ops.hpp | 19 +- csrc/include/topk_plain.h | 9 +- csrc/kernels/topk_per_row_kernels.cu | 84 +- csrc/kernels/topk_plain_kernels.cu | 1322 +++++++++++++++++++------- op_tests/test_topk_plain.py | 175 ++-- 8 files changed, 1212 insertions(+), 409 deletions(-) diff --git a/aiter/jit/optCompilerConfig.json b/aiter/jit/optCompilerConfig.json index ad1a9b3309..52e76078bc 100755 --- a/aiter/jit/optCompilerConfig.json +++ b/aiter/jit/optCompilerConfig.json @@ -1077,7 +1077,8 @@ "module_topk_plain": { "srcs": [ "f'{AITER_CSRC_DIR}/pybind/topk_plain_pybind.cu'", - "f'{AITER_CSRC_DIR}/kernels/topk_plain_kernels.cu'" + "f'{AITER_CSRC_DIR}/kernels/topk_plain_kernels.cu'", + "f'{AITER_CSRC_DIR}/kernels/topk_per_row_kernels.cu'" ], "flags_extra_cc": [], "flags_extra_hip": [], diff --git a/aiter/ops/topk_plain.py b/aiter/ops/topk_plain.py index dea2c654b7..cd768b01e9 100644 --- a/aiter/ops/topk_plain.py +++ b/aiter/ops/topk_plain.py @@ -13,7 +13,12 @@ def topk_plain( x: torch.Tensor, topk_ids: torch.Tensor, + topk_out: torch.Tensor, topk: int, - largest: bool, + largest: bool = True, + rowStarts: torch.Tensor = None, + rowEnds: torch.Tensor = None, + stride0: int = -1, + stride1: int = 1, ) -> None: pass diff --git a/csrc/include/opus/opus.hpp b/csrc/include/opus/opus.hpp index bc3631e2a2..f2b96e4483 100644 --- a/csrc/include/opus/opus.hpp +++ b/csrc/include/opus/opus.hpp @@ -907,7 +907,7 @@ template<> OPUS_D float min(const float&a, const float&b) { return template OPUS_D T med3(const T&a, const T&b, const T&c) { auto 
max_0 = max(a, b); auto min_0 = max(a, b); return max(max_0, max(min_0, c)); } template<> OPUS_D float med3(const float&a, const float&b, const float&c) { return __builtin_amdgcn_fmed3f(a, b, c); } -template<> OPUS_D __fp16 med3<__fp16>(const __fp16&a, const __fp16&b, const __fp16&c) { return __builtin_amdgcn_fmed3h(a, b, c); } +template<> OPUS_D _Float16 med3<_Float16>(const _Float16&a, const _Float16&b, const _Float16&c) { return __builtin_amdgcn_fmed3h(a, b, c); } ///////////////////////////////////////////////////////////////////////////////////////////////////////// // buffer load/store related OPUS_D constexpr auto buffer_default_config() { diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index 908865ae07..c8262eed72 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -1635,10 +1635,15 @@ namespace py = pybind11; py::arg("final_output"), \ py::arg("final_lse") = std::nullopt); -#define TOPK_PLAIN_PYBIND \ - m.def("topk_plain", \ - &topk_plain, \ - py::arg("values"), \ - py::arg("topk_ids"), \ - py::arg("topk"), \ - py::arg("largest")); +#define TOPK_PLAIN_PYBIND \ + m.def("topk_plain", \ + &topk_plain, \ + py::arg("values"), \ + py::arg("topk_ids"), \ + py::arg("topk_out"), \ + py::arg("topk"), \ + py::arg("largest") = true, \ + py::arg("rowStarts") = torch::Tensor(), \ + py::arg("rowEnds") = torch::Tensor(), \ + py::arg("stride0") = -1, \ + py::arg("stride1") = 1); diff --git a/csrc/include/topk_plain.h b/csrc/include/topk_plain.h index 5a658e491d..087c157196 100644 --- a/csrc/include/topk_plain.h +++ b/csrc/include/topk_plain.h @@ -6,5 +6,10 @@ void topk_plain(torch::Tensor& values, torch::Tensor& topk_ids, - int topk_num, - bool largest); + torch::Tensor& topk_out, + int topk, + bool largest = true, + torch::Tensor rowStarts = torch::Tensor(), + torch::Tensor rowEnds = torch::Tensor(), + int64_t stride0 = -1, + int64_t stride1 = 1); diff --git a/csrc/kernels/topk_per_row_kernels.cu 
b/csrc/kernels/topk_per_row_kernels.cu index 14eae78163..89331c52df 100644 --- a/csrc/kernels/topk_per_row_kernels.cu +++ b/csrc/kernels/topk_per_row_kernels.cu @@ -420,7 +420,8 @@ __device__ void filter_and_histogram(T const* in_buf, IdxT* histogram, bool select_min, int pass, - bool early_stop) + bool early_stop, + IdxT k) { constexpr int num_buckets = calc_num_buckets(); __shared__ IdxT histogram_smem[num_buckets]; @@ -893,9 +894,19 @@ __global__ void radix_kernel(T const* in, int const pass) { const int64_t batch_id = blockIdx.y; - const IdxT row_len = phase == Phase::Prefill - ? rowEnds[batch_id] - rowStarts[batch_id] - : rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1; + + IdxT row_len = len; + if(phase == Phase::Prefill) + { + if(rowStarts && rowEnds) + { + row_len = rowEnds[batch_id] - rowStarts[batch_id]; + } + } + else + { + row_len = rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1; + } auto counter = counters + batch_id; IdxT current_k; @@ -965,7 +976,8 @@ __global__ void radix_kernel(T const* in, histogram, select_min, pass, - early_stop); + early_stop, + k); __threadfence(); bool isLastBlock = false; @@ -1187,7 +1199,8 @@ __device__ bool filter_and_histogram_for_one_block(T const* in_buf, Counter* counter, IdxT* histogram, bool select_min, - int pass) + int pass, + IdxT k) { constexpr int num_buckets = calc_num_buckets(); for(int i = threadIdx.x; i < num_buckets * 2; i += blockDim.x) @@ -1371,11 +1384,25 @@ __global__ void radix_topk_one_block_kernel(T const* in, __shared__ IdxT histogram[num_buckets * 2]; const int64_t batch_id = blockIdx.x; - const IdxT rowStart = phase == Phase::Prefill ? rowStarts[batch_id] : 0; - const IdxT rowEnd = phase == Phase::Prefill - ? 
rowEnds[batch_id] - : rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1; - const IdxT row_len = rowEnd - rowStart; + + IdxT rowStart = 0; + IdxT rowEnd = len; + if(phase == Phase::Prefill) + { + if(rowStarts && rowEnds) + { + rowStart = rowStarts[batch_id]; + rowEnd = rowEnds[batch_id]; + } + } + else + { + rowEnd = rowEnds[batch_id / next_n] - next_n + (batch_id % next_n) + 1; + rowStart = 0; + } + + const IdxT row_len = rowEnd - rowStart; + if(threadIdx.x == 0) { counter.k = k; @@ -1448,7 +1475,8 @@ __global__ void radix_topk_one_block_kernel(T const* in, &counter, histogram, select_min, - pass); //@TODO CHECK UPDATE CODE + pass, + k); //@TODO CHECK UPDATE CODE __syncthreads(); scan(histogram + use_one_pass * num_buckets); @@ -1811,6 +1839,35 @@ void standalone_stable_radix_11bits(void* buf, } } +// Explicit template instantiation for standalone_stable_radix_11bits +template void standalone_stable_radix_11bits(void* buf, + size_t& buf_size, + float const* in, + int batch_size, + int64_t len, + int* rowStarts, + int* rowEnds, + int k, + float* out, + int* out_idx, + bool greater, + hipStream_t stream, + int next_n); + +template void standalone_stable_radix_11bits(void* buf, + size_t& buf_size, + float const* in, + int batch_size, + int64_t len, + int* rowStarts, + int* rowEnds, + int k, + float* out, + int* out_idx, + bool greater, + hipStream_t stream, + int next_n); + // AIR TopK end static inline __device__ uint32_t floatAsSortableUint(float x) @@ -2410,6 +2467,9 @@ int64_t invokeComputeTopkLastDimWorkspaceSize(int32_t numRows, int32_t stride0) return buf_size; } +// Explicit template instantiation to ensure the symbol is available for linking +template int64_t invokeComputeTopkLastDimWorkspaceSize(int32_t numRows, int32_t stride0); + void top_k_per_row_prefill(const torch::Tensor& logits, const torch::Tensor& rowStarts, const torch::Tensor& rowEnds, diff --git a/csrc/kernels/topk_plain_kernels.cu b/csrc/kernels/topk_plain_kernels.cu index 
4bf732756c..7c03823ae0 100644 --- a/csrc/kernels/topk_plain_kernels.cu +++ b/csrc/kernels/topk_plain_kernels.cu @@ -49,10 +49,251 @@ utils::hip_check_((val), __FILE__, __LINE__); \ } +// Forward declaration of topk_per_row kernel from topk_per_row_kernels.cu +namespace aiter { + +// Phase enum for distinguishing prefill vs decode paths +enum class Phase +{ + Prefill, + Decode, +}; + +template +__global__ void topk_per_row(const float* logits, + const int* rowStarts, + const int* rowEnds, + int* outIndices, + int stride0, + int stride1, + int rowOffset); + +// Forward declaration of standalone_stable_radix_11bits from topk_per_row_kernels.cu +template +void standalone_stable_radix_11bits(void* buf, + size_t& buf_size, + T const* in, + int batch_size, + int64_t len, + IdxT* rowStarts, + IdxT* rowEnds, + IdxT k, + T* out, + IdxT* out_idx, + bool greater, + hipStream_t stream, + int next_n = 0); + +} // namespace aiter + +// Forward declaration of workspace size calculation function (at global scope) +template +int64_t invokeComputeTopkLastDimWorkspaceSize(int32_t numRows, int32_t stride0); +extern template int64_t +invokeComputeTopkLastDimWorkspaceSize(int32_t numRows, + int32_t stride0); + +// Forward declaration of helper function to call topk_per_row kernel +template +void topk_per_row_kernel_launcher(const float* in, + const IdxT* rowStarts, + const IdxT* rowEnds, + IdxT* out_idx, + const float* out, + int batch_size, + int stride0, + int stride1, + int k, + hipStream_t stream); + +// Helper function to determine if topk_per_row kernel should be used +// Based on: n + K log²K ≥ 3 × Factor(n) × n +// where Factor(n) = 1/3 + 1.6/(log₂(n) - 9.5) +// Simplifies to: K log²K ≥ 4.8n/(log₂(n) - 9.5) +// TODO: We need to confirm whether, when n <= 2048, we might choose +// radix sort because the denominator becomes very small; does that +// still yield the best performance? 
+template +__forceinline__ __host__ bool should_use_topk_radix(IdxT len, IdxT k) +{ + const double n = static_cast(len); + const double K = static_cast(k); + + if(K <= 1.0) + { + return false; + } + + const double log_n = std::log2(n); + + const double denom = std::max(0.0001, log_n - 9.5); + + const double rhs = (4.8 * n) / denom; + + const double log_k = std::log2(K); + const double lhs = K * log_k * log_k; + + return lhs >= rhs; +} + +// Gather kernel to extract values based on indices (uniform length) +template +__global__ void gather_topk_values_kernel(const T* __restrict__ in, + const IdxT* __restrict__ indices, + T* __restrict__ out, + int batch_size, + int len, + int k) +{ + int batch_id = blockIdx.x; + if(batch_id >= batch_size) + return; + + const T* in_row = in + batch_id * len; + const IdxT* idx_row = indices + batch_id * k; + T* out_row = out + batch_id * k; + + for(int i = threadIdx.x; i < k; i += blockDim.x) + { + IdxT idx = idx_row[i]; + if(idx >= 0 && idx < len) + { + out_row[i] = in_row[idx]; + } + } +} + +// Gather kernel for variable length with strides +template +__global__ void gather_topk_values_strided_kernel(const T* __restrict__ in, + const IdxT* __restrict__ indices, + T* __restrict__ out, + const IdxT* __restrict__ rowStarts, + int batch_size, + int stride0, + int stride1, + int k) +{ + int batch_id = blockIdx.x; + if(batch_id >= batch_size) + return; + + IdxT start = rowStarts[batch_id]; + const T* in_row = in + batch_id * stride0; + const IdxT* idx_row = indices + batch_id * k; + T* out_row = out + batch_id * k; + + for(int i = threadIdx.x; i < k; i += blockDim.x) + { + IdxT idx = idx_row[i]; + if(idx >= 0) + { + // idx is relative to rowStart, need to add start and apply stride1 + out_row[i] = in_row[(start + idx) * stride1]; + } + } +} + namespace topk { + +// ============================================================================ +// TYPE TRAITS FOR DATA/COMPUTE TYPE SEPARATION +// 
============================================================================ +// +// Design Philosophy: +// - DataType (DataT): The storage/I/O type for memory operations +// - ComputeType (ComputeT): The type used for internal computations +// +// Mapping: +// - fp16, bf16, float -> compute as float (better precision, consistent ops) +// - int -> compute as int +// +// This separation allows: +// 1. Memory-efficient storage with compact types (fp16, bf16) +// 2. High-precision computation with float +// 3. Easy extension for new types (e.g., fp8, int8) +// +// Usage: +// using ComputeT = compute_t; +// ComputeT val = type_convert::to_compute(data_val); +// DataT result = type_convert::to_data(compute_val); +// ============================================================================ + +namespace type_traits { + +// Primary template: maps DataType -> ComputeType +template +struct ComputeTypeTraits +{ + static_assert(sizeof(DataT) == 0, + "ComputeTypeTraits not specialized for this type. 
" + "Supported types: _Float16, __bf16, float, int"); +}; + +// Specializations for floating-point types -> float +template <> +struct ComputeTypeTraits<_Float16> +{ + using type = float; +}; + +template <> +struct ComputeTypeTraits<__bf16> +{ + using type = float; +}; + +template <> +struct ComputeTypeTraits +{ + using type = float; +}; + +// Specialization for integer types -> int +template <> +struct ComputeTypeTraits +{ + using type = int; +}; + +// Convenience alias +template +using compute_t = typename ComputeTypeTraits::type; + +} // namespace type_traits + +// Bring compute_t into topk namespace for convenience +using type_traits::compute_t; + +// ============================================================================ +// TYPE CONVERSION UTILITIES +// ============================================================================ + +namespace type_convert { + +// Convert from DataType to ComputeType +template +__device__ __host__ __forceinline__ type_traits::compute_t to_compute(DataT val) +{ + return static_cast>(val); +} + +// Convert from ComputeType to DataType +template +__device__ __host__ __forceinline__ DataT to_data(type_traits::compute_t val) +{ + return static_cast(val); +} + +} // namespace type_convert + namespace utils { -// Supported types +// Supported types (for validation) template struct is_supported_type { @@ -198,60 +439,62 @@ __inline__ __host__ __device__ constexpr int calc_capacity(int k) namespace numeric { +// ============================================================================ +// BOUNDS AND SENTINEL VALUES +// ============================================================================ +// These functions now work with ComputeType for internal operations. +// The sentinel values are defined in ComputeType space (float for floating-point +// DataTypes, int for integer DataTypes). 
+// ============================================================================ + /** - * @brief Gets the absolute lowest possible value for a numeric type T. + * @brief Gets the absolute lowest possible value for a compute type. + * + * Uses -infinity for floating-point compute types, and the lowest finite + * value for integer compute types. * - * Uses -infinity for signed floating-point types, and the lowest finite - * value for all other arithmetic types. + * @tparam ComputeT The compute type (float or int). */ -template -__inline__ constexpr T get_lower_bound() +template +__inline__ __device__ __host__ constexpr ComputeT get_lower_bound() { - static_assert(utils::is_supported_type_v, - "Unsupported type T: only _Float16, __bf16, float, and int are implemented"); - if constexpr(std::is_floating_point_v && std::is_signed_v) - { - return -std::numeric_limits::infinity(); - } - else if constexpr(std::is_integral_v) + if constexpr(std::is_same_v) { - return std::numeric_limits::lowest(); + return -std::numeric_limits::infinity(); } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v) { - return -__bf16(0x7F80); + return std::numeric_limits::lowest(); } else { + static_assert(sizeof(ComputeT) == 0, "Unsupported compute type"); __builtin_unreachable(); } } /** - * @brief Gets the absolute highest possible value for a numeric type T. + * @brief Gets the absolute highest possible value for a compute type. + * + * Uses +infinity for floating-point compute types, and the maximum finite + * value for integer compute types. * - * Uses +infinity for floating-point types, and the maximum finite - * value for all other arithmetic types. + * @tparam ComputeT The compute type (float or int). 
*/ -template -__inline__ constexpr T get_upper_bound() +template +__inline__ __device__ __host__ constexpr ComputeT get_upper_bound() { - static_assert(utils::is_supported_type_v, - "Unsupported type T: only _Float16, __bf16, float, and int are implemented"); - if constexpr(std::is_floating_point_v) - { - return std::numeric_limits::infinity(); - } - else if constexpr(std::is_integral_v) + if constexpr(std::is_same_v) { - return std::numeric_limits::max(); + return std::numeric_limits::infinity(); } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v) { - return __bf16(0x7F80); + return std::numeric_limits::max(); } else { + static_assert(sizeof(ComputeT) == 0, "Unsupported compute type"); __builtin_unreachable(); } } @@ -259,42 +502,56 @@ __inline__ constexpr T get_upper_bound() /** * @brief Gets a sentinel value for a search algorithm (e.g., Top-K). * - * @tparam FindLargest A compile-time boolean. If true, returns the lowest possible - * value (the starting point for finding a maximum). If false, returns the - * highest possible value (the starting point for finding a minimum). - * @tparam T The numeric type. + * The sentinel is defined in ComputeType space. For finding the largest values, + * we use the lowest possible value as sentinel (so any real value will be preferred). + * For finding the smallest values, we use the highest possible value. + * + * @tparam FindLargest If true, returns lowest value. If false, returns highest value. + * @tparam ComputeT The compute type (float or int). */ -template -__inline__ constexpr T get_sentinel_value() +template +__inline__ __device__ __host__ constexpr ComputeT get_sentinel_value() { if constexpr(FindLargest) { - static_assert( - !std::is_unsigned_v, - "Cannot determine a meaningful lower bound for finding the 'largest' unsigned value. 
" - "The lowest value is 0, which is a poor sentinel."); - return get_lower_bound(); + return get_lower_bound(); } else { - return get_upper_bound(); + return get_upper_bound(); } } /** - * @brief A generic comparison function for search algorithms. 💡 + * @brief Gets sentinel value based on DataType (converts to appropriate ComputeType). + * + * This is a convenience overload that deduces the ComputeType from DataType. + * + * @tparam FindLargest If true, returns lowest value. If false, returns highest value. + * @tparam DataT The data type (fp16, bf16, float, int). + */ +template +__inline__ __device__ __host__ constexpr compute_t get_sentinel_value_for_data() +{ + return get_sentinel_value>(); +} + +/** + * @brief A generic comparison function for search algorithms. * * Compares `val` against `baseline` according to the search direction * specified by the `FindLargest` template parameter. + * Works with ComputeType values. * * @tparam FindLargest If true, checks if `val` is greater than `baseline`. - * If false, checks if `val` is less than `baseline`. + * If false, checks if `val` is less than `baseline`. + * @tparam ComputeT The compute type (float or int). * @param val The new value to check. * @param baseline The current best value. * @return True if `val` is "preferred" over `baseline`. */ -template -__device__ __host__ constexpr bool is_preferred(T val, T baseline) +template +__device__ __host__ __forceinline__ constexpr bool is_preferred(ComputeT val, ComputeT baseline) { if constexpr(FindLargest) { @@ -310,6 +567,19 @@ __device__ __host__ constexpr bool is_preferred(T val, T baseline) namespace sorting { +// ============================================================================ +// SORTING OPERATIONS (Work with ComputeType) +// ============================================================================ +// All sorting operations in this namespace work with ComputeType values. +// The template parameter T should be the compute type (float or int). 
+// The idxT parameter is the index type (typically int32_t). +// +// The sorting algorithms use: +// - DPP (Data Parallel Primitives) for small-stride shuffles (≤8) +// - Wave intrinsics (__ballot, __popcll, __shfl) for larger operations +// - Bitonic sort/merge for efficient parallel sorting +// ============================================================================ + template struct BitonicMerge { @@ -492,26 +762,30 @@ __forceinline__ __device__ T shfl_xor(T val, int stride) } } -template -__forceinline__ __device__ constexpr T get_guard(const bool x) +/** + * @brief Gets guard value for bitonic sort comparisons. + * + * This function returns boundary values used in bitonic sorting. + * Works with ComputeType (float or int). + * + * @tparam ComputeT The compute type (float or int). + * @param x If true, returns lowest value; if false, returns highest value. + */ +template +__forceinline__ __device__ constexpr ComputeT get_guard(const bool x) { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { - auto inf = _Float16(0x7C00); - return x ? -inf : inf; + return x ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); } - else if constexpr(std::is_same_v) + else if constexpr(std::is_same_v) { - auto inf = __bf16(0x7F80); - return x ? -inf : inf; - } - else if constexpr(!std::is_floating_point_v) - { - return x ? std::numeric_limits::lowest() : std::numeric_limits::max(); + return x ? std::numeric_limits::lowest() : std::numeric_limits::max(); } else { - return x ? 
-std::numeric_limits::infinity() : std::numeric_limits::infinity(); + static_assert(sizeof(ComputeT) == 0, "get_guard only supports float and int compute types"); + __builtin_unreachable(); } } @@ -709,14 +983,27 @@ struct BitonicMerge<64, ascending, T, idxT> namespace buffer_load_helpers { -constexpr int MAX_CAPACITY = 512; +constexpr int MAX_CAPACITY = 2048; using int32x4_t = int __attribute__((ext_vector_type(4))); using floatx4_t = float __attribute__((ext_vector_type(4))); -using bf16x8_t = uint16_t __attribute__((ext_vector_type(8))); +using bf16x8_t = __bf16 __attribute__((ext_vector_type(8))); using halfx8_t = _Float16 __attribute__((ext_vector_type(8))); using index_t = uint32_t; +__device__ __forceinline__ static int32x4_t +asm_buffer_load_dwordx4(int32x4_t srsrc, + int32_t voffset, + int32_t soffset, + int32_t aux) __asm("llvm.amdgcn.raw.buffer.load.v4i32"); + +template +__device__ __forceinline__ VecType +buffer_load_dwordx4(int32x4_t srsrc, int32_t voffset, int32_t soffset, int32_t aux) +{ + return __builtin_bit_cast(VecType, asm_buffer_load_dwordx4(srsrc, voffset, soffset, aux)); +} + } // namespace buffer_load_helpers // --- Wave-Level Priority Selection Primitives (AMD/HIP Optimized) --- @@ -766,21 +1053,39 @@ struct BlockTopkSort; template struct BlockTopkMerge; -// WaveBuffer: Manages per-wave register storage for priority candidates -template +// ============================================================================ +// WAVE BUFFER (Stores priorities in ComputeType) +// ============================================================================ +// +// WaveBuffer manages per-wave register storage for priority candidates. 
+// Key design: +// - DataT: The I/O type for loading/storing data +// - ComputeT: The internal type for priorities (float or int) +// - Priorities are stored as ComputeType for consistent computation +// - Conversion happens at I/O boundaries +// +// Template parameters: +// - capacity: Power-of-2 buffer capacity (>= wave size) +// - DataT: Data type for I/O (fp16, bf16, float, int) +// - IdxT: Index type (typically int32_t) +// ============================================================================ + +template struct WaveBuffer { + using ComputeT = compute_t; + static constexpr int slots_per_lane = capacity / opus::get_warp_size(); static_assert(capacity >= opus::get_warp_size() && utils::is_power_of_2(capacity), "Capacity must be power-of-2 and >= wave size"); - T priorities[slots_per_lane]; + ComputeT priorities[slots_per_lane]; IdxT positions[slots_per_lane]; int lane_id; IdxT target_count; - T sentinel; + ComputeT sentinel; - __device__ WaveBuffer(IdxT k, T sentinel_value) + __device__ WaveBuffer(IdxT k, ComputeT sentinel_value) : lane_id(threadIdx.x & (opus::get_warp_size() - 1)), target_count(k), sentinel(sentinel_value) @@ -792,13 +1097,16 @@ struct WaveBuffer } } - __device__ inline void reset_slot(int slot, T val = {}, IdxT pos = {}) + __device__ inline void reset_slot(int slot, ComputeT val = {}, IdxT pos = {}) { priorities[slot] = val; positions[slot] = pos; } - __device__ inline void flush_results(T* __restrict__ out_vals, + // Flush results to output buffer + // OutT can be DataT (for final output) or ComputeT (for LDS operations) + template + __device__ inline void flush_results(OutT* __restrict__ out_vals, IdxT* __restrict__ out_indices) const { #pragma unroll @@ -807,7 +1115,7 @@ struct WaveBuffer const IdxT global_slot = i * opus::get_warp_size() + lane_id; if(global_slot < target_count) { - out_vals[global_slot] = priorities[i]; + out_vals[global_slot] = static_cast(priorities[i]); out_indices[global_slot] = positions[i]; } } @@ -815,10 
+1123,14 @@ struct WaveBuffer }; // Helper for merging sorted sequences (used by multiple strategies) -template +// Works with ComputeType internally, reads from ComputeType buffers +template struct WaveMergeHelper { + using ComputeT = compute_t; + // Merges a sorted k-element chunk with the buffer's existing Top-K + // Input is in ComputeType (from LDS or previous computation) // EXAMPLE (finding Top-4 largest, capacity=64, k=4): // Wave-distributed storage (64 lanes, each lane holds slots_per_lane=1 value): // Lanes 0-3: [80, 85, 90, 95] (current top-4, in ascending order) @@ -843,8 +1155,8 @@ struct WaveMergeHelper // // Extract top-k=4 (last 4 in ascending order): // Lanes 60-63 now contain: [85, 90, 95, 100] - __device__ static void merge_sorted_range(WaveBuffer& buffer, - const T* __restrict__ in, + __device__ static void merge_sorted_range(WaveBuffer& buffer, + const ComputeT* __restrict__ in, const IdxT* __restrict__ in_idx, IdxT start) { @@ -854,56 +1166,64 @@ struct WaveMergeHelper { if(idx < start + buffer.target_count) { - T candidate = in[idx]; - if(numeric::is_preferred(candidate, buffer.priorities[i])) + ComputeT candidate = in[idx]; + if(numeric::is_preferred(candidate, buffer.priorities[i])) { buffer.priorities[i] = candidate; buffer.positions[i] = in_idx[idx]; } } } - sorting::BitonicMerge::merge(buffer.priorities, - buffer.positions); + sorting::BitonicMerge::merge(buffer.priorities, + buffer.positions); } }; // Forward declarations for kernel wrapper functions -template -__global__ void __launch_bounds__(512, 2) topk_filter_kernel(const T* __restrict__ in, +// Note: Kernels use DataT for I/O and compute_t for sentinel/internal computation +template +__global__ void __launch_bounds__(512, 2) topk_filter_kernel(const DataT* __restrict__ in, const IdxT* __restrict__ in_idx, int batch_size, IdxT len, IdxT k, - T* __restrict__ out, + DataT* __restrict__ out, IdxT* __restrict__ out_idx, - T sentinel); + compute_t sentinel); -template -__global__ void 
__launch_bounds__(512, 2) topk_sort_kernel(const T* __restrict__ in, +template +__global__ void __launch_bounds__(512, 2) topk_sort_kernel(const DataT* __restrict__ in, const IdxT* __restrict__ in_idx, int batch_size, IdxT len, IdxT k, - T* __restrict__ out, + DataT* __restrict__ out, IdxT* __restrict__ out_idx, - T sentinel); + compute_t sentinel); -template -__global__ void __launch_bounds__(512, 2) topk_merge_kernel(const T* __restrict__ in, +template +__global__ void __launch_bounds__(512, 2) topk_merge_kernel(const DataT* __restrict__ in, const IdxT* __restrict__ in_idx, int batch_size, IdxT len, IdxT k, - T* __restrict__ out, + DataT* __restrict__ out, IdxT* __restrict__ out_idx, - T sentinel); + compute_t sentinel); -// Kernel function pointer type alias -template -using KernelFuncPtr = void (*)(const T*, const IdxT*, int, IdxT, IdxT, T*, IdxT*, T); +template +using KernelFuncPtr = + void (*)(const DataT*, const IdxT*, int, IdxT, IdxT, DataT*, IdxT*, compute_t); // Helper: Map block-level strategy class to its corresponding kernel function template -template