From fd734db96efc665a6f36b43af056518cdfd3babd Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 26 Dec 2025 11:41:02 +0000 Subject: [PATCH 1/3] add py and cpp interface changes --- aiter/ops/mha.py | 9 +++++++++ csrc/include/torch/mha_v3_fwd.h | 3 ++- csrc/py_itfs_cu/asm_mha_fwd.cu | 3 ++- op_tests/test_mha.py | 21 +++++++++++++++++++++ 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py index 9dca7b3210..ad0889512d 100644 --- a/aiter/ops/mha.py +++ b/aiter/ops/mha.py @@ -254,6 +254,7 @@ def fmha_v3_fwd( bias: Optional[Tensor] = None, alibi_slopes: Optional[Tensor] = None, gen: Optional[Generator] = None, + l_tpf: int = 0, ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: ... @@ -1234,6 +1235,7 @@ def _flash_attn_forward( how_v3_bf16_cvt: Optional[int] = 1, cu_seqlens_q: Optional[torch.Tensor] = None, cu_seqlens_kv: Optional[torch.Tensor] = None, + l_tpf: int = 0, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: (_, seqlen_q, nhead_q, hdim_q) = q.shape @@ -1274,6 +1276,7 @@ def _validate_cu(name: str, x: Optional[torch.Tensor]): _validate_cu("cu_seqlens_kv", cu_seqlens_kv) if can_impl_fmha_v3_fwd() and seqlen_q > 128: # Prefer CK for decode cases + print(f"TESTINGGG fmha_v3_fwd = {l_tpf}") out, softmax_lse, S_dmask, rng_state = fmha_v3_fwd( q, k, @@ -1290,6 +1293,7 @@ def _validate_cu(name: str, x: Optional[torch.Tensor]): bias, alibi_slopes, None, + l_tpf, ) else: out, softmax_lse, S_dmask, rng_state = mha_fwd( @@ -1700,6 +1704,7 @@ def forward( how_v3_bf16_cvt: Optional[int] = 1, cu_seqlens_q: Optional[torch.Tensor] = None, cu_seqlens_kv: Optional[torch.Tensor] = None, + l_tpf: int = 0, ): is_grad = is_grad_enabled and any(x.requires_grad for x in [q, k, v]) if softmax_scale is None: @@ -1731,6 +1736,7 @@ def forward( how_v3_bf16_cvt=how_v3_bf16_cvt, cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_kv, + l_tpf=l_tpf, ) if is_grad: assert return_lse @@ -1829,6 +1835,7 @@ def backward(ctx, dout, *args): None, # how_v3_bf16_cvt None, # cu_seqlens_q None, # cu_seqlens_kv + None, # l_tpf ) @@ -1848,6 +1855,7 @@ def flash_attn_func( how_v3_bf16_cvt=1, cu_seqlens_q: Optional[torch.Tensor] = None, cu_seqlens_kv: Optional[torch.Tensor] = None, + l_tpf: int = 0, ): """dropout_p should be set to 0.0 during evaluation Supports multi-query and grouped-query attention (MQA/GQA) by passing in KV with fewer heads @@ -1918,6 +1926,7 @@ def flash_attn_func( how_v3_bf16_cvt, cu_seqlens_q, cu_seqlens_kv, + l_tpf ) diff --git a/csrc/include/torch/mha_v3_fwd.h b/csrc/include/torch/mha_v3_fwd.h index 9ec33136fc..49c94168ab 100644 --- a/csrc/include/torch/mha_v3_fwd.h +++ b/csrc/include/torch/mha_v3_fwd.h @@ -19,6 +19,7 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] std::optional out_, // [b, sq, hq, d_v] std::optional bias_, // [sq, sk] std::optional alibi_slopes_, // [hq] or [b, hq] - std::optional gen_); + std::optional gen_, + int tokens_per_frame); } // namespace torch_itfs } // namespace aiter diff --git a/csrc/py_itfs_cu/asm_mha_fwd.cu b/csrc/py_itfs_cu/asm_mha_fwd.cu index 2efc1d7828..fa91208d97 100644 --- a/csrc/py_itfs_cu/asm_mha_fwd.cu +++ b/csrc/py_itfs_cu/asm_mha_fwd.cu @@ -155,7 +155,8 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] std::optional out_, // [b, sq, hq, d_v] std::optional bias_, // [sq, sk] std::optional alibi_slopes_, // [hq] or [b, hq] - std::optional gen_) + std::optional gen_, + int tokens_per_frame) { auto q_dtype = q.dtype(); TORCH_CHECK(q_dtype == torch::kFloat16 || q_dtype == torch::kBFloat16, diff --git a/op_tests/test_mha.py b/op_tests/test_mha.py index 824caf1f4d..a75895090b 100644 --- a/op_tests/test_mha.py +++ b/op_tests/test_mha.py @@ -91,6 +91,7 @@ def run_ck( return_attn_probs=False, cu_seqlens_q=None, cu_seqlens_kv=None, + l_tpf=0, ): (out, softmax_lse, S_dmask), us_fwd = run_perftest( aiter.flash_attn_func, @@ -110,6 +111,7 @@ def run_ck( cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_kv, num_rotate_args=1, + l_tpf=l_tpf, ) if dropout_p > 0.0: @@ -213,6 +215,7 @@ def test_flash_attn_output( mha_type, dtype, input_layout, + l_tpf, ): torch.random.manual_seed(0) torch.cuda.empty_cache() @@ -302,6 +305,7 @@ def test_flash_attn_output( deterministic, return_lse, return_attn_probs, + l_tpf=l_tpf, ) out_ref, softmax_lse_ref, dq_ref, dk_ref, dv_ref, dbias_ref = run_torch( @@ -414,6 +418,7 @@ def flash_attn_output_benchmark( mha_type, dtype, input_layout, + l_tpf, ): return test_flash_attn_output( batch_size, @@ -430,6 +435,7 @@ def flash_attn_output_benchmark( mha_type, dtype, input_layout, + l_tpf, ) @@ -493,6 +499,7 @@ def test_flash_attn_seq_padding( deterministic, mha_type, dtype, + l_tpf, ): torch.random.manual_seed(0) @@ -591,6 +598,7 @@ def test_flash_attn_seq_padding( return_attn_probs, cu_seqlens_q, cu_seqlens_kv, + l_tpf=l_tpf, ) # 3. Run Torch with padding_mask (forward pass only) @@ -681,6 +689,7 @@ def test_flash_attn_seq_padding( l_causal = [False, True] l_local = [False, True] l_deterministic = [False, True] +l_tpf = 0 parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, @@ -806,6 +815,13 @@ def test_flash_attn_seq_padding( help="""input_layout. e.g.: -i BSHD""", ) +parser.add_argument( + "-tpf", + "--token_per_frame", + type=int, + default=0, + help="""Tokens per frame for causal attention. Default is 0.""", +) if __name__ == "__main__": args = parser.parse_args() @@ -818,6 +834,9 @@ def test_flash_attn_seq_padding( if args.deterministic is not None: l_deterministic = [args.deterministic] + if args.token_per_frame > 0: + l_tpf = args.token_per_frame + collected = [] for ( dtype, @@ -844,6 +863,7 @@ def test_flash_attn_seq_padding( mha_type, dtypes.d_dtypes[dtype], args.input_layout, + l_tpf, ) collected.append(ret) test_flash_attn_seq_padding( @@ -861,6 +881,7 @@ def test_flash_attn_seq_padding( deterministic, mha_type, dtypes.d_dtypes[dtype], + l_tpf, ) df = pd.DataFrame(collected) From 5f90b7e3d5f67d8b72ce6037a54e5223819e8065 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 26 Dec 2025 13:51:19 +0000 Subject: [PATCH 2/3] add complete interface change in both cpp and py --- aiter/ops/mha.py | 1 - csrc/cpp_itfs/mha_fwd_generate.py | 6 ++++-- csrc/include/mha_fwd.h | 12 ++++++++++-- csrc/include/rocm_ops.hpp | 4 +++- csrc/py_itfs_cu/asm_mha_fwd.cu | 9 ++++++++- hsa/gfx950/fmha_v3_fwd/codegen.py | 23 +++++++++++++---------- 6 files changed, 38 insertions(+), 17 deletions(-) diff --git a/aiter/ops/mha.py b/aiter/ops/mha.py index ad0889512d..eb82fc1bd5 100644 --- a/aiter/ops/mha.py +++ b/aiter/ops/mha.py @@ -1276,7 +1276,6 @@ def _validate_cu(name: str, x: Optional[torch.Tensor]): _validate_cu("cu_seqlens_kv", cu_seqlens_kv) if can_impl_fmha_v3_fwd() and seqlen_q > 128: # Prefer CK for decode cases - print(f"TESTINGGG fmha_v3_fwd = {l_tpf}") out, softmax_lse, S_dmask, rng_state = fmha_v3_fwd( q, k, diff --git a/csrc/cpp_itfs/mha_fwd_generate.py b/csrc/cpp_itfs/mha_fwd_generate.py index 741bb3640b..d1fd832efa 100644 --- a/csrc/cpp_itfs/mha_fwd_generate.py +++ b/csrc/cpp_itfs/mha_fwd_generate.py @@ -99,7 +99,9 @@ int how_v3_bf16_cvt, const void* seqstart_q_padding_ptr, const void* seqstart_k_padding_ptr, - bool is_v3_api_check) + bool is_v3_api_check, + int magic_const, + int tokens_per_frame) {{ int head_size_q = args.hdim_q; int head_size_v = args.hdim_v; @@ -178,7 +180,7 @@ def get_v3_api(): - v3_call = "fmha_fwd_v3(traits, args, stream_config, is_v3_api_check)" + v3_call = "fmha_fwd_v3(traits, args, stream_config, is_v3_api_check, magic_const, tokens_per_frame)" gfx_list = get_gfx_list() v3_arch_list = [arch for arch in ["gfx942", "gfx950"] if arch in gfx_list] diff --git a/csrc/include/mha_fwd.h b/csrc/include/mha_fwd.h index 65ff3ea1d5..77577e8fbe 100644 --- a/csrc/include/mha_fwd.h +++ b/csrc/include/mha_fwd.h @@ -89,7 +89,9 @@ __attribute__((visibility("default"))) float mha_fwd(mha_fwd_args args, int how_v3_bf16_cvt = 1, const void* seqstart_q_padding_ptr = nullptr, const void* seqstart_k_padding_ptr = nullptr, - bool is_v3_api_check = false); + bool is_v3_api_check = false, + int magic_const = 0, + int tokens_per_frame = 0); __attribute__((visibility("default"))) float mha_fwd_splitkv(mha_fwd_splitkv_args args, @@ -177,6 +179,10 @@ struct __attribute__((packed)) fmha_fwd_v3_args p2 _p30; const void* ptr_kseq_padding; p2 _p31; + unsigned int tokens_per_frame_magic_const; + p3 _p32; + unsigned int tokens_per_frame; + p3 _p33; }; struct fmha_fwd_v3_traits @@ -231,6 +237,8 @@ namespace gfx950 { float fmha_fwd_v3(mha_fwd_traits t, mha_fwd_args a, const ck_tile::stream_config& s, - bool is_v3_api_check = false); + bool is_v3_api_check = false, + int magic_const = 0, + int tokens_per_frame = 0); } } // namespace aiter diff --git a/csrc/include/rocm_ops.hpp b/csrc/include/rocm_ops.hpp index c8262eed72..96d4e5be94 100644 --- a/csrc/include/rocm_ops.hpp +++ b/csrc/include/rocm_ops.hpp @@ -790,7 +790,9 @@ namespace py = pybind11; py::arg("out") = std::nullopt, \ py::arg("bias") = std::nullopt, \ py::arg("alibi_slopes") = std::nullopt, \ - py::arg("gen") = std::nullopt); + py::arg("gen") = std::nullopt, \ + py::arg("tokens_per_frame") = 0); + #define MHA_FWD_PYBIND \ m.def("mha_fwd", \ diff --git a/csrc/py_itfs_cu/asm_mha_fwd.cu b/csrc/py_itfs_cu/asm_mha_fwd.cu index fa91208d97..fa335116f7 100644 --- a/csrc/py_itfs_cu/asm_mha_fwd.cu +++ b/csrc/py_itfs_cu/asm_mha_fwd.cu @@ -312,6 +312,8 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] softmax_scale, p_dropout, drop_seed_offset); + + int magic_const = (uint32_t)(((1ULL << 32) + tokens_per_frame - 1) / tokens_per_frame); float t = aiter::mha_fwd(args, stream_config, @@ -323,7 +325,12 @@ std::vector fmha_v3_fwd(at::Tensor &q, // [b, sq, hq, d] quant_scale_enum::no_scale, true, false, - how_v3_bf16_cvt); + how_v3_bf16_cvt, + nullptr, + nullptr, + false, + magic_const, + tokens_per_frame); TORCH_CHECK(t >= 0, "invalid argument for fmha_fwd"); } else { diff --git a/hsa/gfx950/fmha_v3_fwd/codegen.py b/hsa/gfx950/fmha_v3_fwd/codegen.py index 2563390b5f..038ef1db0c 100644 --- a/hsa/gfx950/fmha_v3_fwd/codegen.py +++ b/hsa/gfx950/fmha_v3_fwd/codegen.py @@ -172,7 +172,7 @@ class fmha_fwd_v3_kernel }; template -float fmha_fwd_v3_dispatcher(const ck_tile::stream_config& s, mha_fwd_args a) +float fmha_fwd_v3_dispatcher(const ck_tile::stream_config& s, mha_fwd_args a, int magic_const, int tokens_per_frame) { if(s.log_level_ > 0) std::cout << ", " << FmhaFwdV3Name::fwd_v3_name << std::flush; @@ -182,6 +182,7 @@ class fmha_fwd_v3_kernel { tune_opt -= 2; } + tune_opt = 0; // disable tune for fmha v3 for now fmha_fwd_v3_args args; args.ptr_o = a.o_ptr; @@ -219,6 +220,8 @@ class fmha_fwd_v3_kernel args.ptr_kseq = nullptr; args.ptr_qseq_padding = nullptr; args.ptr_kseq_padding = nullptr; + args.tokens_per_frame_magic_const = magic_const; + args.tokens_per_frame = tokens_per_frame; auto traits = fmha_fwd_v3_traits{a.batch, a.nhead_q, @@ -303,7 +306,7 @@ class fmha_fwd_v3_kernel ); } -float fmha_fwd_v3(mha_fwd_traits t, mha_fwd_args a, const ck_tile::stream_config& s, bool is_v3_api_check) { +float fmha_fwd_v3(mha_fwd_traits t, mha_fwd_args a, const ck_tile::stream_config& s, bool is_v3_api_check, int magic_const, int tokens_per_frame) { float r = -1; if (t.use_ext_asm == true) { if (t.data_type.compare("bf16") == 0) { @@ -317,7 +320,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } else { if (a.batch_stride_lse >= a.nhead_stride_lse) { @@ -325,7 +328,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } } } @@ -335,7 +338,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } else { if (a.batch_stride_lse >= a.nhead_stride_lse) { @@ -343,7 +346,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } } } @@ -392,7 +395,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } else { if (a.batch_stride_lse >= a.nhead_stride_lse) { @@ -400,7 +403,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } } } @@ -410,7 +413,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } else { if (a.batch_stride_lse >= a.nhead_stride_lse) { @@ -418,7 +421,7 @@ class fmha_fwd_v3_kernel if (is_v3_api_check) { return 1; } - r = fmha_fwd_v3_dispatcher(s, a); + r = fmha_fwd_v3_dispatcher(s, a, magic_const, tokens_per_frame); } } } From ceb294e27b05e080fe0e56cb63225226d9407e16 Mon Sep 17 00:00:00 2001 From: antsaukk Date: Fri, 26 Dec 2025 16:23:54 +0000 Subject: [PATCH 3/3] replace binary and save original under _temp --- csrc/include/mha_fwd.h | 1 + hsa/gfx950/fmha_v3_fwd/codegen.py | 2 ++ .../fmha_v3_fwd/fwd_hd128_bf16_causal.co | Bin 30512 -> 33720 bytes .../fmha_v3_fwd/fwd_hd128_bf16_causal_temp.co | Bin 0 -> 30512 bytes 4 files changed, 3 insertions(+) create mode 100755 hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_temp.co diff --git a/csrc/include/mha_fwd.h b/csrc/include/mha_fwd.h index 77577e8fbe..60ba683ad9 100644 --- a/csrc/include/mha_fwd.h +++ b/csrc/include/mha_fwd.h @@ -7,6 +7,7 @@ #include "aiter_hip_common.h" #include "fmha_fwd.hpp" #include "mask.hpp" +#include namespace aiter { struct mha_fwd_traits : public fmha_fwd_traits diff --git a/hsa/gfx950/fmha_v3_fwd/codegen.py b/hsa/gfx950/fmha_v3_fwd/codegen.py index 038ef1db0c..0bd6450d51 100644 --- a/hsa/gfx950/fmha_v3_fwd/codegen.py +++ b/hsa/gfx950/fmha_v3_fwd/codegen.py @@ -102,6 +102,8 @@ class fmha_fwd_v3_kernel { int length = strlen(name); std::string kernel_func_name = "_ZN5aiter" + std::to_string(length) + name + "E"; + std::cout << "Loading kernel: " << kernel_func_name << std::endl; + std::cout << "HSACO ptr: " << hsaco << std::endl; HIP_CALL(hipModuleLoadData(&module, hsaco)); HIP_CALL(hipModuleGetFunction(&kernel_func, module, kernel_func_name.c_str())); } diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal.co index dae2c6e62f1ff32079ce763400521c845aa63639..085e17d52f4dce31fc8c80f5bc2ad0df4e3eb831 100755 GIT binary patch delta 7102 zcmeI1TWAzl7{|{{HalUvHkVqptBpxE*<`%L+3T$Fwz;bDZlXqGvfU)RR@5XlS(Pg8 zrcJb1AEKv7pwgyDYg>FsnTJXdb_+g;Q0Nw+^raN)gHRCclS09Mvol}Lt{mt~eR2=P zGw1t#|Nl;QXa5<`%sm!vUlgYs#9;Euis3}EmShLcq|8_gFcs^Eb;g?b^XHlU#v12b zofXi`S(AU7KaUzq@~$O&;3Oy%Va*rPCc#zn-+uH1Ay(O4ZTRDCyb2)_ls8pVtJol% z($gQyM%MNS%9zwG7 z5q-Y!i~W23gp;XKy;l%ODbz}#)<%B(*}hyPRfJ@!)CwWXix}@PIeeg*M78QL8j4v1 z8q`2T4H`&{qXArE87Jz?8Ap9B@LJTr!8q#S3y{dvp~0K10S#PW;6j7NjH5w4@Oso& zGmd)rurm#)cQTH8nIuR?MuR%mfCekTUBYf#_JIO-MP3hG-KM}5K#4Q@1OXANlJ0q#MA&5Wag7q}Po zdl*N(54aEYU5uOhg#P`^_|ae=Ye0hl@BkVdU>r|C1+Jn#!Z_-Kz=NnCWZcjvM6wnd z)}ld_HK0Kg@Fp}k!Z@D5I^gS2{~qJ0Uk`je>c<$*>bJr7bOSVOK!Z4IK!c6IH=@C5 z#_%A=3)H74@8uX#?JddQQl+>mk#Q2Aq)T0D}%R;DpR(;G0p;37IXxx1gRAGF$bK z*@_07km&@2PBh?z%r@ZLP|pdO?ZCIAo)a=V^pM$s2Aq)D34A9Sa6)Dm@Lj0qgv@T> zyHU>z8TiAM*@Fh0kl71-FB)({rVDr%>Nz3P4ZIumypY)k4g1i56EbkimD!I5oRB#H z`~d1XA=3lA2lc#=IS360(SQ>&hkzeK15U{F0`EmVCu9x-Ka6@_$n-%&9~y8%rXP4e z8gN1;3_J{aZLTR~6~i{|?@(7!;qIky_foifDcrsMPj@eER_i0{WUBm`_QUpH#g`c_ zg}au*T}$Dv<(0Z?v5VThuCFb{V?~px7CPaRwa?7tNDUkdv#uh{;Jb|!qw zA})QVxrTnXEeifMl_KOa{7j_uEqHzOIH_Z#Qhey7O(T^Rv#^P(4~sF8yUr^Ui)eCW zN{;!Ipc)Gd_&mXw5)H)ssy{jq84N~(5!K@xiVV0t-XSFrm9tybRdH)VBKKgoEO*$m z3yFO4LVdAWAB9gRcfgJ>42k@4A-77%YO{VFc2jbPA(zNp5~{GxFh5||28KiyM(0b} z1K4VSA(5V~*@O9i;Qz-kB=Qez1>}yQ>_Q?B-pQ>JGGf-3<@je`|9uk#hw}wfW}TeN zuTPux2Os9wjfX1|d2}hi{*~E(=HvYOcZ9$;3H$~hy#N*nP!>zbjQO|*Wq06!_kiM0 zsGAQc949X(R1Y~G8HgSaD@wpWf7sS)9=d&M5UW0qid9cLHYlyiyv*+oV6Uon;9+ye zd_${(4obVPU2pKhE4$sT3L1M`u-WZbo3ZL&n{C#6{mmYFt9mc}a_pGcT}>;-8*M%2 zB@P#U=Xk(|SNyD#UV`OaliqdG`>=dyF5&B8iwA5Wc!r=4cwEyNSIhBXPMuMm*-y*k zm2@s%X2Z94<}AG*4^)&~H#$npkNas6tvD4BOU}{GQyazG=jbJv=FZdmFjXa}?X>G( D$ZiyN delta 4357 zcmeH~OKcNY7=`~k*l}W9VW$FOP{4K)$0k5L9>))!32_o03FJX|LmC{XEFwq?6;%~s zJH$nmA`zG_sy@`mqNNpTX{S=Vpc;_6U{eW4t-7GELwld@3Vz2#$}*7sEb zxzBKZd7)0059wi9F*`q@1moY|sBU7xz<_rGB(RLBlMOXq@$q;*1oar#6Hu=X)P; zx3;Q9vE>w-?qWq*3c!;ZVWF!^Ruabo;K0G3qvS2_qj-$(N{QeyBgO zikaNbxq}hD-%OdxQFxS88yOh1aE@onnYkZ53FPq1PCI~3~d(N&v^qQ+^SLt zJPZvB-o$w`qjm7tfww?g1eZBi7_EoD9()5)uCze}+IS$rXd?m}5om?B3f{qaC!+}b z2)GQD1@Gd#hfx%M6dYf1NfCU2^C3nt_%ZM{%2BCJ1crIwAfq?}aRd_3gy5r`XBZ{n zC&5$Dl;By;Uu3ii{wDCv(9MFs#Q9l9Ti|a2-)elqts?LW51ePT4gNL+wnMjr$Kp2q zA{BLQPI1|n{=RfZx6j!0WmPlk>fsh`K6!7+({xfhW^cz?I-nimDlDC}1O5*1PH3m# zmQG5;PlI7;IeZUlOuJ%U?0X(#-h;Jcu^1h;h3Zuq;w_dxduZs{Z}Nu^%! zKBJTRM8MKXdlA@+z&_|c!7ZKC55FHAzsu5q;FeArgg*#A1RWCG(nNhjc+ zKwtzq0$%B)hwbfE(ht>Iv}fq4nrCqtdbXD#N&B%sX{-9hKBLz^(rSl)mKK+w#UvHjKL|+#S_M?|4LznT0K<^C!?d; zu~;^q8cRe@X2+FGGL=!1qtR?O*_KIWVsbPU8ynB0!sbK7-Kpx_*Vuwu%Vi!oDF1}N zlKaQ7BefRx_$UV*dd=z*VM}fL=dh)-7B;LX2Zhd=gPi%-_=Bw-%0V}=k+2pvPACUO zu(z`o_9-aj`O~0aW89y@hQ?avokRZn1@q(lj^R+MCimq9Hr>Qw?GoglcwqbuK7U~R=)$F3 zJWUg$C$keHkw`pJ8Kx6_D9h=%aFvwE%8^)ya697NVi-$yiedb*n`)~BIuf!7bShC} zs2FGNmb=qNtN+hD*%Y7&L)~avkH#)_S!{=A$^Kqx>^;3Sewh(vD z^46dAtAmBONBmNR4yw1OmaDDPHR?A7kJESnH7lUb7ZM(E+ZxKIyO)Q(PI0H*XVk&z zxF^qVe4c;v`Ny+PduxRIh3P(+4oHznwf5q+n$XMTi6<-Z^B29&x4654sjH{!8T|tz C|D?eH diff --git a/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_temp.co b/hsa/gfx950/fmha_v3_fwd/fwd_hd128_bf16_causal_temp.co new file mode 100755 index 0000000000000000000000000000000000000000..dae2c6e62f1ff32079ce763400521c845aa63639 GIT binary patch literal 30512 zcmeI53wRXQmFI6)KP1a7+_b@N(Kc>`Xj%e;)kc1>&HdCsW z1jg8lWZCOrpIbf6M==u8iK_K#!-L^Agmf;|=yj$5PHGHJDz# z=z4^!HEQQR0qT2TbAjUG+4GjIVyxlY{vBn(T7TyDKuu+!qW0NmzLn{(*;adnRUZ6y zroXgqm%qa2FD(t!*1lB}@R#}a)Rfl+U@0rBfzDT3?Jo_yzOABai@)MqnU($>fp0E< zBeQDj*4jWF`&MRc`Qw4aw=;M9D|Q8Z+siA<{*v$N+p=rx)F%4mFMR4ZsMCt6RRpK_WippEYS{T>*HWw*+NGjmHyZ*T%@(o)yKiU zvW1>LD*drrxLRvrRv!oZ$`%&%QE9*}9AF^6R=i@4)<|z3$NS1g&hMjAzl~Hw`SsdT ze}%tBJMJ>>!lVT*eFeVCh5p;HGS2#IhWb5D!SnGtI2x~mmt%17YGL;&I37d&9;e_$ybeyr>)@SO9BkX^ z*DeI-PQ~ItmmHmYFJ1>9#OvUr7#zI5-KW1Eo{Plbqem#5iPyo|cpaRJ!NG~`KHVkZ zLJS^yoPe2{t=)Z`j;t6Q z^*kMUaeF9;+r#u2JiN7&pN`61y6Eu6;Hc;6m>su=`Eh$Fj={t0yL)6;=MBqZaMJTs ztc=^k+PFPzjKRZ+-MS3xykT<;E_$Abvba42Y15Skg z%W*sg7d=nJiMTzSjN8LIgYgjCxXH=tx*Fe3{$7DTBKh)xEugzoL9wrK$Z(da80>U>{`e@bC3)t?<|R z_EgnuSDkYkAF=(+?OVTOk{?6CH97fPcLe>ut$WIR!Lsa}0^gRc+4;Uw|E^kp#hec_ zt82=4`|AR?58d51_FiU9U?)bDdmQ399N%DgJF|9Mb&apIYFB05<~K5H_wCqHRqKEqWvezM&j-$X{ZB@_5u@kY z^9QQk0}iDp2DIlFR6Xsl$q3S=(0_@FZs9L&*$aNV?59R(Na#iTYD4&ng8m!WTOHtT zI3wg!?sb%*r0n}O7V1ZgR@Q_@YR@_Jz32ZwT0_)uFd9XB&6bGLYQYjw9yKvqA}&&x zERigt#S-xn6NJbzKn6DxtwMyJ{fM9}gDg>qv;bc4I4IvML{5PbFak2{e*y4hgdUT`z%!D=u9%48pg#$&QMNKAm6rX2Q5I7lAc#v6KVQdSmp~J`;J4~#p!)Q*D>LbsD zCx<-EjULZIPbew;5IYba8BLez6s8|aZ%z-T%jqFYv!}rlPK3?UnBFWjW;6>=V*hkc zYlg=oXQZ3L9zi&e%o@T*(px;O9>F7fo=iXJX%w1`XN?Dq=|&l?MH9X(BXh7nJKQmH zDe;Algo(r>9SOz6){c=)#Fsjv>}wtLEsC_7PJmX}@I?o}IE$VP2eG#o+YQ4`PC9?ya9*4k^*}+b$ zQ(#u7BWT6;bVYZd)A|Hs(?^VaEaZ47IAi1se=%d^hzjyyeb~Y7bB<&o#|TAlARWp^ z@ILc?i72yIJ7o4+M`Yr-)=2UAR?{d?tI?I-it{Ub5))z8|4O&;i%r?=9vuz$$oaL95>N`ooe(>gOt-QQ}J<>cfB(-PK7pKt{0MRm@)O7fNGIeH$;e7ayug``2;5Dz2+G9Ho%$%5oS@*w$; z0?1UzbjVD_XYcdb*ZCk_@AuPJdlKY(!z*kD!mr?Sr+A1W9_VwUn1~`C=yRobY;icO zR;MXA#B2)Ux*4R;xzUX47d~f(YpMr2bAzW@DTjDDYO(}nuU6lNcy_oO96avmI8-l$ zGIUm4Kdd$!D>20uOmy0Uh|gZ_akzT#Bin4jsLdW^BA(kr8yg*c*YUz{a>_8p=OKC@ z!-oZ%)c8M+b|j;W_wLf#MCItPVaj`q4tpTTX5+nL{|4-HV8nM>M@IP%;UOaI;_DczQ_ESg{^eHqJ@RZcR5;EB-8u%;yX?8V`9cs z`Yy{ZG*I~-c9+McriNH52!x)5!DAfs&BXBi_eg+MVO%Qf{Q6UkwpdHLLCUcMr`l>NKpRE0N=^YYc>y!?5Dm#;;5`DPa{U+?1OTa+`<`4DmbT^s&&t;DeC=+j^y1CG(`!&Uo(2Ihv|4Lfr6i(@tWDAhjN!0`Lzq6hYLFkQ2c zQSD<5ECc!s*vEn6G<&vc&o!{|(2s|G0ysglPg3oZ4J;G-OxP!a6E(Z2+KUV<3;HbB zv%zf5K1a3BGq4=!b705Ms*Aary+pOIFt9x6^I)F@PSWhFRQnnO%ZEN6_Q~L6&F)w2 zr3O5?&Mp?fJ_Vej*#oM5n}JP*ek$zKz-gL&r)sY?u<6iGhkXV(L$mKv?RyMtCiF95 zp9Ri>JulxBZbTlOfEckNkMZOQxRm_xf{Z+nVj=`mOgKdz6V4(Y6Ah}p5u})K&fziP zoX72lRQppP#e{PKj|t}@Za=Krp9U!=oQru(IG1vJi)w!sq?mA)@R)F};PzJ4{v1d# z;atUI!nuaqkE-?;L5d0IIvx|w4cz{cYCi^2OgJCqG2!%a`*GF&O^{;3>E|)wEamoB zRr_lo#e_4!W5T(O+fSj zgtM00zpL8YK#B?HE*=xkJ>33*YX3e+G2z_DW5W3axBo!3e+W`cIRApjgtLBx!C}mh zhORaBE#D~(y(zE4^YlL+?XYe$;eC^<8me0M3>}ZKfdT%w6SRvI0{S!Q^C>T zXfO>-1IK`4KsV?H$AV))59k5Y!E`VK%mBxMELv51~>zp3C;v(fwMrD z=#@r^g;J_GTN*9SmD0rd(im}}n1rDmeBN{}+Dj5dV+hkckGPqk(8PI+}?07##jxsu0iDQh8=ZWb?#|y*>M#oWNw$bqtagx#TGI5I0@eSe(qr=O_zMnTK zFEr4cs7}pA#=1@6>-RQ=%}u88eU_$hk(yKK@8fju*Wb74Udz}QpWUP8FrbpdfJzPn zDme_O{(kRg_&~dJo8_lU>-NI-{8e|N_4H*j|`?Z}$ zvj^MhkPOH;$au&E2-&aiGnzB8JrR-x$%f=Wav{rLr(7rx+mj&qkjan&$P@_KFD)oC zPsR2$$aKgI$V|vA2-z=}6q)bG^~CnWNRsX2(@FQ!^<=C^v0YhLWF9sqO|bp-nIzla ze3E4Q+p|fwAO9%HcK%!v)xF^_GTVOolO)^43rV(5JCkfbyOd=6yPqf7KD(^gKQD*< zm%m7Iv0$gGNMNJD)OmDI40fh15-<%c*y&zM_rPGMXNADX=gpl(<}oN|tf6~fuygzd zfsF+-eRQu2c4n0dEFH|*M)$U0XWn*R@7BH|^Ej0AtLUB<>@27i*aUFu9=eAGJEuP( zu!-QzdVy*2OgU}uT()=aO8Fvv|D$6)GX378ZI3%CJWd{mK95g6OH$%m;b|`~U+@Y_ zxl>M5ct%cA_(_D9FGYCy=WM8=Kg;e{_#-bbU-t6yFXR+Oe@=EP{7HnDuS9tHS1eW0 zcd;~uzxML-4KFYMPR>yDSLN{ve-q*5&m+A2dzP!{udzuA|JlpSH@&?42YH&Jzb?;E z_%9J&z7^r+FX&vvl1F=SPxT_t5E1_Qr zJEr?xT&3Apsr8XZuZDg#>}$X^nq8MiuZ4as?CZdFnmwS_-)3Ozpj|t~<+^)}~DJGo%k;jDdYuv8SqbVkw zFY=gh9^-a>9!)Xf{5p>b=PTT<&!Z_OoZsXz;e3tT^?5YKg!5ZGCY*0@yFQPmm~g(y zW5W41x9jt0iV5dm@|bYG%kBC+nqtEF9Uc?THg4DF(G(NTb{-SX_qko4M^j8Vzt3aB z`60LK^Jt0*=U?%daE3=1w0?!nxaLR12D6q|dmDM4TlggB?58>BHgnE@ZncT(oLSId zru=*1*SNm;80V5#IG4S~x%>^zm2dNUXG?g!)$els+BVMh?{jYakaJTQ#C?&Db#7gQ z85e%!flSi!ZO1s5sT#Hm;%Rg5yxWWX(~JC*?3WgtHj5~WC>Mf-U=dgZ&IV_LbHF*^ zTyQQ(^)8p3HqS$O9?J8<`QQR@0k{xc2rdE_fyH1kNcFC)J8fQ!@?w;ifJ?xo;8JiI zxC|@-OTgvea!^LSE>Wg_gPEGtxZqh%{epQFNa#D6}jPTcw5K^U@;m1x0tWq|j0<9+ei0FG)+pmlfTu zb%mCt;y0va;&G`&d=;PLpI1Jwf3TrvM3o$`C{BrSxv_gDcS}C5y?_J&xxk`LXS}ndKtr5Si z=%`O}t$0dWCw^C2FTSVfs4sGZ_&sT(_<{7O_ya{p{gRu+k0hViq2%yl$Ssgh>X$4; zzFY*E4VeR(3z-KYJM~x2$MyoqLdYUWF=R1>?9^|$1lvm?%OE9?<&YJSEZ8ZBUy1Ei zkkybikhPF?5VBK0=Xz{!fNX?33fTnlLC8-1pMB-UidZYCku8K9~0O- zaLFqIn-4B~O<)Va8qceY}a2DEc*WT7U*t-3N8A6chY_S zRG~%R|4zD_&lXzr{qUr_^zHqvq#XA;B{dI{D>i@gb$0haohIW76tlXcwf2Z6XXSh4g@PB`tfk}T( z&kg-nXd3j`f{bjvcfS6o98b`$2QD>VzdL?#XO17lny_4oWwf6x8&^}J7i&-wKCe9vH>>ly6xJOh1>hsKe$Ydw+1g^4uI zOQf+|y6#=TI4zOJW3}rbk;Y49kj6iWG@ePM@kt_$MYe!+ogmVfBay}#i8QwOG1jus ze;@ATXZrPXV)S!i^mAbJb63LxyJ^xE_PtWrb>G@nDJ#R~-KCc## zo~I?!Gqgl{Zk9;T%I478C^>^$8)X{iO47P1)&OJE)U{D^ur^9gA8VtKp4Ri|UK?e% zvNp=JrPEsV>l)A+4lP*2fz}9U#(D}%u{Hs%4`8OXW3bK$U)y7RP*B%!c!gQq9*ZSt zhK<-@v7}j<`Jw|8hV4ZoBo8aRU^f^rjYkgK=cFlXirDdc-fs9NR+{Y zJ3cdLH@c6yr z-2Ab?{NR?q5!e>6?8gEt1-JfGppG$VOM65rZ9gpqp$mR0uyXLRzZ2MFV8t&5wjHee z2fpuP?H8o&?VVDvEh3e-ot7SJJ0opx`vhx$?C*MDGd_0k!Jt{D`H6OQexhBSpJ-R- zC)(BdiFS2BfLPE)d5@2`-;kbY|DCkAtxMY1_G{_!wyV+;ZNHJc z)b47tOnh*!S*CfBc6DB)U7Z(cSLa3A)p?P2bzY=hofm0W=SA8vFH*)aWuuXWz=i}P z*3<}gHr({!kmMf2vKWKip=jf4U9ps8G9Qw3pg_$Sl)*OuITC)2`0Pw5#(m?dp6?yE-4! zuFl7_tMf7Kn2*_lW44SovS-28u}0PkK0nULo&#T)$lKD|?y7&TJ+=NQbVqZI>_zaU z$wu}P`0_L(I|hDZ7T@=!_N@A2?K$;F+g$Z8wx!m;)RtC%tj%2?p?0^JW#WTJ&2oWk z30p9~-NWWy5GD)w4q)8T{3_PPsO~b-6M@Bu$8d07@SRbV!Dcq2jcA$IR zDCj~bWslGxYik;XCeu10rttmnEi`zJcp5!2t+6nRrN5tkFujG=Sx5-IDPz0YL+eSU zdk%PHQ&{$Z<5Kbc8{O)OqpfoQ#XG&h#*3v2I zx10T?4TGXdVDDivzW!xZ^>Q!$gQ|WbCgU!#n zRQ*SoRHT1l4i#PrEDIC=^e@a~!z+P(gh^QXSD8{sVEfnVwTvkrGWI{y%Rg^+0y|sU zbN>UXpS`@|ZDWh7FWz%orEDBi^_ywNknZ)mvYo(g^wNJv)w6ot&e%+*4@O92w&tGt zk5KGu{lb5QzOVHQS!QKbU4Uhl?W@GXi!5_n<*v-2zc$F!|4Cn0!!m2C%KUYHmRT3r zTc_;p-%(x)b)cfAN?Qn%WtLX$phYPeE#g?`-vS0vn{RQl3jdZsg)b{Be|Gm~*6q#N zlhw`a?AiI6Hg}4)Z+2FmsAsc8>GlrdoLns%MkIA7h7u9I{ujMFJE?fQ8iO23artF-#sby8!Ry1!=E>Z@v& zf(`Pm*|qDcMwhyyX4l%U@e$aKy87>o;uSP%*KyU2@*cEp;#i3CmC}@{>V`_MrdQd! zJP~@Tuia;EuOB*^SGfImOL>v+tZjo>_eQBp?}1+0_gecqqt#rkoFm%Q&Yw-y&s23< zeObLO$W?p?S=~