From 247bdf4d6b7fc9551247e9285a022b19d6e8aac6 Mon Sep 17 00:00:00 2001 From: zanzhang Date: Fri, 19 Dec 2025 09:43:58 +0800 Subject: [PATCH 01/10] nhead=64 non-ps ready --- csrc/py_itfs_cu/asm_mla.cu | 8 ++++++++ ...A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co | Bin 0 -> 31872 bytes op_tests/test_mla.py | 6 +++--- 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100755 hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu index 192137659a..9425d6c447 100644 --- a/csrc/py_itfs_cu/asm_mla.cu +++ b/csrc/py_itfs_cu/asm_mla.cu @@ -171,6 +171,14 @@ void mla_decode_stage1_asm_fwd( "/mla/mla_dec_stage1_bf16_a16w16_subQ128_mqa128.co"); impl_ptr = &impl_a16w16_bf16_subQ128; } + else if(gqa_ratio == 64) + { + sub_Q = 64; + static AiterAsmKernel impl_a16w16_bf16_subQ128( + "_ZN5aiter42mla_a16w16_qh16_m64x1_n16x1_coex0_mask1_psE", + "/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co"); + impl_ptr = &impl_a16w16_bf16_subQ128; + } else if(gqa_ratio == 16) { if(persistent) diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co new file mode 100755 index 0000000000000000000000000000000000000000..0f0a9b8200e6210652fb93747299170b54ee4ac1 GIT binary patch literal 31872 zcmeHw4R}=5x$erINeFu;%Mg;VgP{%~h9QI@2>}K~oq_Nt0Ro~0s}2(~A(8w!2?!R4 z%@7bNDn_J8(MANMYAt%vgFQAGy=c*cUP3({>Cd^e_t@KWZhNlnJ?-tUbKh^Ty~1Q1 z2n0E~eVXumynDUt@7>?bUi;hM-ZQglcEubeDam7H;?Kg)u_SYwz$3Reh>eP5;@m71 z|8r(%DWIvZM^t|fBj#WdQ_Xfn%Q;P|Hw-t664OPQ`auF?k{=^@X5vY|C`)89{YdM{ zR% z8AHgCS~oNYo~@fy+E1C!k<5x)XUjR$6U^DCP%S~HFIuSK&m~f(Bf*S)%5)~^^hL`& zO#WPOyVO~nV9Gvqx)XHzqIHVbeOaXx>PawXpF-y)=ya8YmP?_P3Fhom=<)=eu9DCS zDRgCmIr|h^o1oJLg_=vo;g(hYdVf&P+bo8#$_0X!n!bjJU3FTou>bli?7y)e{SUSH z7B2Bh51i8bF+{8jPPJX>0Nby00Gxm5gmGD~;`1%OHS2IHs;_J9amB8FjMK{%yRUSB zy;nNG{(cN_xW%``H?yV34ITX$qn8^FUg-cwu5^GG`!T@5mRKho?Z*hcoN(+)2RMGE z1H93X0bXeF)i4#R8Q@C7!zJg#gj1)*q?Z@S zp87aJuM3`&u0-msF;dSZn6%Fm@Tp%U==DWQHG;&YJ+arRPZP}9x6gq4mPcA_O%37Hv}46eKmFK z1Jyn{b$>ufh1ZM=N?b|yH1OBgt!}iw`Z=Duyr1Ma?rQL__tn(%QS(wm{0IOHOR2w zs$S+r%r-Qa$=<)8Mh{rQJ)TfmdipG+%=lIY9&xBLC< z2CvD<4ead_kD0gW)x<2^47}N5XKmOSnWssLCT)vGQK6(n@IQ7mC1MO=O^G13W+-7y<4A>c9>l8>U2#0zAM7l~Yq9rvW>d z;@F{B9Z{>bAPQYkMG?rDf!kP$qYK*#3wK)BmQE{c>$D{E(5A@Ft_XXx)6(W{9~B*C z?89}@>kM?~=IZ*WQQ;44iZRtAKr7~=wxgq?MH`1h*GG>IpB^sS>;h7osCPj-IVbA9 zrCkTn=5@C(U^_Zbv-VCmdjz-#wy!hFzSc?ahA8`yiI1E3sEJ=U@f!vXjSe;DT@hVj z;IQa01Jj~u24+RG49tn<7>M={Qbr5y4Mckb(cVC`HxTU&B!4KIE87<>?$(!7+PeNQ zVr!Sq-tOEryXeuPw)ySzXU%W77ENe(O?*W!onY(wHR?;@4>$H@BB7?~dQMJQXYX{D zPWZk)t7vTdTz0r~tlb&X#~$CDIoBCtOz(<1o$6=NO^Y+HafY(k)-!IW!m`-5GnwNR zmO0*8n~CiLgWGy0b1`1HLf1W;okg`{U3(rF>ng0LF)}AO*@*Ejw%J)|@JYm>Yypl) zt_%@6+4no=j&-*exR2{Pd%e?Lq;^f)JZf9kO=V@>smSR~NKdZ~r>Ez$^vZLoYKlG|Wn) z^JgVDhq9A{AuG1s>FN5kVZ*{pDv#^#TgYeaokOB-@3WGlsOB+<&&sL1ZYVVga^uon6YqRt|FDw4WX4i`?(&tre? z)MvP%b;-G0X4Tm+6EjR4ZepgujnR!dC55AI^r7|bucY4?>}QnrbKYN%=KlJ&b@5It zV>`O?IV1eWOn-4TH^nANEFcm$TyS}9)-I@XN$;KzRhug7Nbdgzsg zKUhSdl`S#KsUCrLU0`}dU?y8?)N3Arxw=3{L|{I<)u?xR1QzK6iz5Qd*fOKu?GZRb z7wCxy?DcvK37hYkXN*3~Bhab~OpOS%vjawbx<_E9E>MdI%w^9S^$w4~d|jY3BCv=Z zH0p~z0?TxP?uft{?2u9K@dz|DHeVx-;UQc8|9xLejZt>2Q?aBBiLzHaS&}t5%C>dh z@ci+cQ_|F6>L8awQ{_nXHDgl0%_K znYAEnwdO~MxQzhWZMD{hS<3lxN=ofFjdPrmKO~yh?!dW7&lBfEHWcTlJe&`))luZ_IA>B{s+VP1ucKa%sh4G0j{pD{5i$MEvaHup)??~rS=J+v!Ts5o{$^R$ z>nQ6n^|CDM8RAZcb9;||vn=a%D#ye$x>AYPYAI|P>~`34*d4GtVRyq;!m41^uo_q` z>>gM>tP$1>3&L7q>tO3)_ro^ALNFZ`hPA=A!M4M8z;?nOf$f5|!*;{=!1luS!S=(R zhIPOWzz)I=!;Zk7hrI}U33e3r3hWr{HP~_3cVTb9I$;smTd)(bcVH)B@4-&NK7gHu zoq>G_`v~?i?1!))!~Pj|4)!zHzrcPD`vvUZV84X@JM2@~Z(#oc`yK4}us_274Eqem zHtC82v%*qfsW260hjCasECZGa%Z6#Nk+59YmtYQ99xNYL0CU0$VMQ<(tQb}bD}zme zxna{`GhjEtJg}Lta@ZW$JlHL;3fQ7e6koPE3+)tNZgCbBP<*)^?~&~BskLFJh>tr8 z3++~;Tv!mrwgbn2QBFY^p5wjNBX*+`BJ{hrf^1Zl6uB!QA zW*5gkj)xTRDa70D7?0=Eacwh?Yg^25P4Q1`4Bm5$UmtUf=bt~Ojbi|HI^Obh%p1o5 z>RIeDPJPX?tk+|Yaq4fDWj%{M#;JeoF;4Tx9^=&CEcf(}J;tej>@iOLV~=s_Z~LepIoTLE{oBm?()1ny^wQSR^vJjg6Y`G`m0ixeB7 z-%*N6)ue!6xeZ-zO@C^GFaAe6x>;(hQF!osI*_0!UQM(c2MY|E>MZ0S!iFVgc zLc1qKJ7zWK%r@tAnsZLSj5%Mv#5rFT?dEP)yZ(^6wQES&+O^Tz*0t%x&MpW0=?``# zDI?>3N_=~AI9IOORNg%_oJ+VTP2k>PsE_kK)$hv^_4{)KK0PX&E9dGMjXrU&5A`{4 zy=Z%IoWR2q1Rg0t`?%MS+CM*8)W0}Q;7d1(Ip}(Y>W_NSFYa|F{*~FH{jvGs+`e8@ zYX90oj1zNCC+_$XaHj96{N39`8{;@I&-KK=u|mvovG)VD>%1$R8y_2NdQ-eM5kJN@ zUq5R5RzS3sbI}??{E0fTU+*-aeN3Dn>rdV*>fc){>ScdgW2pbB4Wj;o2QYq2e_4Ne zv#39_Rn*J=v<6ZC4<8csA3comW8w^|ryK?yL-g8c><8VBSz#+--rOoH>mK>BMZ-dg zK5|UCZpm1J%0~b0`j4@h=@FQv3(SrP96|Ao97C>catv9P^|ZFpn4}|fe3hXmdt#Nf z&lu;i&wexnYZrUMz}=sVd7d=*z3eFi_fvh(oLHJkFakW_+nHk zHDcNV_TK579EE8)&TRWQhc%tC?)0Q5S*Aq`B|8h%WQRjd_c}rj_7B|-+p@ONz|p`w zU>&w ziYhmSqMMKlWAjcsyb8-@(QXGKaioj7b8p;d@S|r5el++jlXqbK%NlpasLz`%>ht3I zjhQR>F){r{oBb${b+PksI<%)-Tp_3DuRnf``A5yeAgbyp)Ji^Bm z?H0l(6s>~rNkv;k_>`hm5At+A_j#DcbFX-&VBcgfA=F9fYqc+MR^2E85+J z-&3@egl{TZ72(^8R!#V>qSX+-uV}S|-HLV(;Xf)`J>gkJYb5-MqBRr#RMCQjpD0=@ z;lC={I>KKm+IqrYE86{pzg4u2gnz(=72#hLO(#sUXko%+i`GUs)S_)8OtWa)35Qv< z9fVmHZ6{%lMSFyBlttS`INGAM6JBr8b`y@XXnP1JShT%_B^GTT;be=ppKzK*dz$b@ zi`GHtwP*(jXIr#`g!3)hVZwzLjgDO>)}3A619tYZaqJvruNru)2z;0LXy>7HS&c0{h8#BP~^Zv%}0q*xIY30DDy%}lqfF0hvH1?*^F2de6 z@I(dp1K!fKO5j27%(P{|!``K7%YjF{RcY*9quojNzJaIi1pmA@n6?u5qBopY4SdPl zo<`@yT5KQn9!RSPzT$l)tr>XC`&L@eBi74Slwb3nNz>( z8_tD9+^`Y+cfG$sdn3l#rK9|YSK$#+-ra_Br#FM^^Nsqw+fj~q^LRv*_wPjcEpI8; zZ!zjScAAo^gA9A_y)D>E8fu;8GRxVfhVZX#o`@(vC;lyMBpiEe<|?}<;ZY~ zIZZh-;D^-jisBty1JD>3jd#rJ9%|RGc!#bta9X^ic*nekqPY^rJ9JHxB;t{T@eXM< z>l4O1q{XaH81K;h2ta85gz*l|kNV4scSsZZ^yFqp1Im{b?~pF@TCV5X%q$zR2hGt} zyhHP%4aGmQ52JYHLh%mmE7e~p-k~|EelXrSeJSzIh4P7m@ea=SgYnMaT)Y#XlacXG zd~Qa@JIu@}>2w}Kyd!fO@T$7ASJALJh zuzL5zJ4v{Q`QV?zxaQUk+h7g%&i9zPcsiFEaZg-Mjrc_I5AhW5B+C0-DBnYUDBekw z_eqrRq4pH-B+C0-EZ;-@DBekw_n~}Gd>)5#JC~9JGU6YZ$BoJF7f2f|~pG0{c%KuP3 z#XnRp`^&h8@;_8h@lSl7hj_~WTvq%;{)~-(=-$}n#y=RNSNucw&tl^rnxn7yhvtoq ze`rjKU*hj+C5VTlkBoRHJ{}s3e=aHhxzK%=!T4t|{)x}`6k(mA{7!t{rx<*qe9sK< zl;4TZ`*^@7%J)=&r~FQQ-lr0LqI}Pt;3>ZopZ8e_elXv2{vKHe{PCZa?|Jhtz4JZt z{wDe(^F9?W(L;~mm$)+da2NUK?&Fy0{@h(Tzd6UI9*5 z=jrQySE4-6#qM_{%JW>x{jNlLp8nkLqJ2+vkBj)rxu+GI!->t+Q0|7tioIt=<=A^v zRF1tTMdjFgP*je+=VaU;!t*rrya;1@v^eVC7RBk-P2jb7Zw`)XyRuHKAZTWB7@H(ezxF85btss{21cr z?qe5vuIEzjV^N#<=Wsq3cMQfEmmOye=1vB4C-J$H_?(H1EiRNhiO-qHT#5(lQKH;Q ze9lDXQZAG`iO-qHT*^v}-{0JcyjQdlV;IlE0O_8PycZ}TltlT2i=9^!D4%dC=hZ~{gu(Ob z;CU6#s}4S&ierE9ygGPZ9Xzk%8B?4Wzv%NSK8@AI&f+rx92YY`0pdFmjBALq=Cl6- zvvh&k5rHGvPmFraBQQr7I5HyeS{g%sJ~R{0yKC2dNy!4{0O|Snk-!naYk~NzTZo>& z&m5&NC*gJYo(PuQZQqQ~rvcM|d{$xTnpv(8WOnmUo@9{Oea$D&F#28yK07yb&Fn9Q zXj{W`_*t7OtvSFEo0ePol1i1|x>V)2EmQfIZ&&$>r}C%v ztNfc!tNfV`l|Oqx<%bTc{JFy_|JD(ezwo@uzx|@hzw?sHUp}hx|MH5;Up=Pszk5yP zuOCTe%_?gN-`4^S{&!4OOUw@(U|MhPw z|J5&5{=ffS<-h(^<^Sh5D*x?&sQmx_PUV02y~_XeN0tBO&nl08rt+jD8@E_&JUQ9M zhYYdtp+jxlX0!3MG#kI>8XF%r%*KZgxACkj8y_*k#&dFP{Mu`6eAFl#zwSC4A3fT} z$BeP@>#w)*v14s~+&CK_Kiu)xL_F0}E*i*2(u+q#tGwl3p4TaZ7WwVhXEMKlqTd_iXr$+^Sr^gRFAH?S;+no9N^qnI3o)bIe>@&vY*IF02 z+cWU_#teMEk-q1HdBB_H3`;?%nThAiV|7s8AnH`@^Kj^i&rmw;nKsqGu96E+Wm(x-lDzcP@iY;ZKGRqVk zbMsDX$Su%kO|-brjO0ubd``?~MzomEnv9$*>PN=)yLPJJuZ`)KWA z0(6u>$K(Ofp<9KHLg;WoM=5kn82}x-On;1rj)~AQ2|A_@fR6NJ;SV=-%zzFLbd(Q( z4mMQimlb0{`bA}0zX)U2uUhz_9)4(sA6f^% z4|4sggN_F1xEDIs4uB51e%%8djnENa{bx>9S=aqX6V>D06OIQbw6~3pd$<&+Xg^~T)(~w9h;zI3v@g<06MPP`jug> zUxcyi7nN!KB8*wTw!;rQ;fG!D!|nm_gIvEJf{urw<1y%XVgPi=^=k)oJOUl<(6MI# zbjbDVYtZp1bUY3nUmpM+a{by19s8l913C^4fDXBSJqaC8LB})DacBT^$n|RfYmL|rgGQKzZ zQC3zEE-TBoPItGDM9yv`K7vB}^w5KNFqg2Qf znt4SBoN4$PX~V! z_?h76fRAW`zXANs;AerK3;tZL;6;Aov$G;UVPKKY3@nZaETcAhy1}_40%x#cM%m*L zXnc?7`TT@}?^ee9$gRZXCro}u9A9kmo;beD2de?ck-q7l2<3{#Ni`1~2`+9Q>W&SAwqwFa5m&{9WMv-~-^LzwZEl zH~1>>HQ=SczXIL|eiiuD;IEp$Dc*3xPjuYI`rC*{Og`4%#FM|NeXPHUCw~(kip~9|QjccF=+B ze-!-V;J*%D`gfP+(^x$M{&;wH%6H}hcM&b z)8?)eq*m$g_%MO;XFPFB#`wc`G8%v783I+a95nH_pGBLDO|u^Btnrs`md#=hRMa)% z*D}T70YgjlPz( z&CN~0R(@0*0CP`ZRjW~O)RYywr`AlVDl2u@6bGi%OrAP9P~~6c_PhO4OUtVLRVAg9s*9%t z#`Vbl+y-2UzacEr*1O7X^1qMkJHq6*kMG?c&-CJN2rDY;%}+4-({a4pT9 z-E(^Li%foAS#N%Y$rr`7uQvJIxbek{JcOOA={?Tb=CS?YMJ>z@vTAT3N}^yTm8({ z8d%?IwDvdDtpXjW4>sYgR9%fX5_7F;YM>w9Vf4c^t^O)tEqaSBfvnzN6{z=>xThAI zo5dwFXUomf($b#IvY9=bQ_JLLdD%3%>7Lcot!#2nH}|ZbZslcD&9)`QbKJ7qoSrF4 zrp%nw&)SbS(Np%ze%$kY|z-UUaof% z9cDvWFZD~j4)yf$L0K=?SBXKhzXY+~$P!~?v1uGiCKWMtOZ1#?YHY=qCxWH^a{cf5 z-6Qk&p?iJ@+URXImfOI|FzQSDK!5UhmcQHFENH_=v?=|E@6a^X+}IyTM8A fvHkV?L`m+ytdsM~axFFz_5alDuf}zVE3*Fweo?0r literal 0 HcmV?d00001 diff --git a/op_tests/test_mla.py b/op_tests/test_mla.py index efe8b47f71..9d32b61dca 100644 --- a/op_tests/test_mla.py +++ b/op_tests/test_mla.py @@ -445,7 +445,7 @@ def test_absorb_decode_fp8(): err = None us_asm_decode = 1e12 - if (dtype == torch.bfloat16 and kvtype == torch.bfloat16) and nhead in [16, 128]: + if (dtype == torch.bfloat16 and kvtype == torch.bfloat16) and nhead in [16, 64, 128]: err, us_asm_decode = test_absorb_decode_bf16() elif kvtype == dtypes.fp8 and nhead in [16, 128]: @@ -475,7 +475,7 @@ def test_absorb_decode_fp8(): block_size = 1 list_dtype = ["bf16", "fp8"] l_kv_dtype = ["bf16", "fp8"] -list_nhead = [(16, 1), (16, 2), (16, 4), (128, 1), (128, 2)] +list_nhead = [(16, 1), (16, 2), (16, 4), (64, 1), (128, 1), (128, 2)] parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, @@ -555,7 +555,7 @@ def test_absorb_decode_fp8(): "--batchSize", type=int, nargs="*", - default=[1, 3, 5, 16, 32, 64, 128, 256], + default=[4, 6, 8, 12, 12, 18, 16, 24], help="""Batch size. e.g.: -b 16""", ) From df189328319d93b0897cdf6393e99830f9898a3e Mon Sep 17 00:00:00 2001 From: zanzhang Date: Fri, 19 Dec 2025 10:43:19 +0800 Subject: [PATCH 02/10] update ps nhead=64 --- aiter/mla.py | 13 +++++++++---- csrc/kernels/mla/metadata/v1_2_device.cuh | 12 +++++++++--- csrc/kernels/mla/reduce.cu | 2 ++ csrc/py_itfs_cu/asm_mla.cu | 21 ++++++++++++++++----- op_tests/test_mla_sparse.py | 3 ++- 5 files changed, 38 insertions(+), 13 deletions(-) diff --git a/aiter/mla.py b/aiter/mla.py index 6f4cd2150a..7173cd419a 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -267,7 +267,8 @@ def mla_decode_fwd( if num_kv_splits is None: num_kv_splits = get_cu_num() if nhead == 16 or ( - nhead == 128 and q.dtype == dtypes.fp8 and kv_buffer.dtype == dtypes.fp8 + nhead == 128 and q.dtype == dtypes.fp8 and kv_buffer.dtype == dtypes.fp8) or ( + nhead == 64 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 ): # Natively support cases pass @@ -282,18 +283,20 @@ def mla_decode_fwd( else: assert False, f"{nhead=} and {max_seqlen_q=} not supported" - logits = torch.empty( + logits = torch.zeros( (reduce_partial_map.size(0) * max_seqlen_q, 1, nhead, v_head_dim), dtype=dtypes.fp32, device=device, ) - attn_lse = torch.empty( + attn_lse = torch.zeros( (reduce_partial_map.size(0) * max_seqlen_q, 1, nhead, 1), dtype=dtypes.fp32, device=device, ) - final_lse = torch.empty((total_s, nhead), dtype=dtypes.fp32, device=device) + final_lse = torch.zeros((total_s, nhead), dtype=dtypes.fp32, device=device) + + import pdb;pdb.set_trace() aiter.mla_decode_stage1_asm_fwd( q, kv_buffer, @@ -314,6 +317,7 @@ def mla_decode_fwd( kv_scale, ) + import pdb;pdb.set_trace() aiter.mla_reduce_v1( logits, attn_lse, @@ -324,6 +328,7 @@ def mla_decode_fwd( o, final_lse, ) + import pdb;pdb.set_trace() if io_transformed: if persistent_mode: diff --git a/csrc/kernels/mla/metadata/v1_2_device.cuh b/csrc/kernels/mla/metadata/v1_2_device.cuh index ad64bce238..81660cf483 100644 --- a/csrc/kernels/mla/metadata/v1_2_device.cuh +++ b/csrc/kernels/mla/metadata/v1_2_device.cuh @@ -356,8 +356,14 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba (q_dtype == at::ScalarType::Float8_e4m3fnuz || q_dtype == at::ScalarType::Float8_e4m3fn); const bool kv_is_fp8 = (kv_dtype == at::ScalarType::Float8_e4m3fnuz || kv_dtype == at::ScalarType::Float8_e4m3fn); - const bool natively_supported = - (num_heads == 16) || ((num_heads == 128) && q_is_fp8 && kv_is_fp8); + + const bool q_is_bf16 = q_dtype == at::ScalarType::BFloat16; + const bool kv_is_bf16 = kv_dtype == at::ScalarType::BFloat16; + + const bool natively_supported = (num_heads == 16) || + ((num_heads == 128) && q_is_fp8 && kv_is_fp8) || + ((num_heads == 64) && q_is_bf16 && kv_is_bf16); + if((natively_supported == false) && (num_heads % 16 == 0)) { qk_batch_ratio = num_heads / 16; @@ -371,7 +377,7 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba uni_seqlen_qo = 1; } - TORCH_CHECK((num_heads == 16) || (num_heads == 128), + TORCH_CHECK((num_heads == 16) || (num_heads == 128) || ((num_heads == 64) && q_is_bf16 && kv_is_bf16), __func__, ": only supports #heads in [16, 128], or (#head, uni_seqlen_qo) = (16*N, 1) where " "N is in [2, 8).") diff --git a/csrc/kernels/mla/reduce.cu b/csrc/kernels/mla/reduce.cu index c2d727c5b9..1ba51d9329 100644 --- a/csrc/kernels/mla/reduce.cu +++ b/csrc/kernels/mla/reduce.cu @@ -608,6 +608,8 @@ __global__ void kn_mla_reduce_v1( NUM_HEAD, 16, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ MLA_REDUCE_CASE_EF( \ NUM_HEAD, 128, HEAD_DIM, 128, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ + MLA_REDUCE_CASE_EF( \ + NUM_HEAD, 64, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ MLA_REDUCE_CASE_EF( \ NUM_HEAD, 128, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ else MLA_REDUCE_ERROR(NUM_HEAD, HEAD_DIM, NAME); diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu index 9425d6c447..c3b5189d9a 100644 --- a/csrc/py_itfs_cu/asm_mla.cu +++ b/csrc/py_itfs_cu/asm_mla.cu @@ -173,11 +173,22 @@ void mla_decode_stage1_asm_fwd( } else if(gqa_ratio == 64) { - sub_Q = 64; - static AiterAsmKernel impl_a16w16_bf16_subQ128( - "_ZN5aiter42mla_a16w16_qh16_m64x1_n16x1_coex0_mask1_psE", - "/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co"); - impl_ptr = &impl_a16w16_bf16_subQ128; + if(persistent) + { + sub_Q = 64; + static AiterAsmKernel impl_a16w16_bf16_subQ64( + "_ZN5aiter42mla_a16w16_qh16_m64x1_n16x1_coex0_mask1_psE", + "/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co"); + impl_ptr = &impl_a16w16_bf16_subQ64; + } + else + { + sub_Q = 64; + static AiterAsmKernel impl_a16w16_bf16_subQ64( + "_ZN5aiter42mla_a16w16_qh16_m64x1_n16x1_coex0_mask1_psE", + "/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co"); + impl_ptr = &impl_a16w16_bf16_subQ64; + } } else if(gqa_ratio == 16) { diff --git a/op_tests/test_mla_sparse.py b/op_tests/test_mla_sparse.py index c93170b0c5..c7cbb9669d 100644 --- a/op_tests/test_mla_sparse.py +++ b/op_tests/test_mla_sparse.py @@ -458,6 +458,7 @@ def test_mla( dtype_q=dtype, dtype_kv=kvtype, ) + # import pdb;pdb.set_trace() # generate kv topk per token & convert indices into per token token_indices = generate_topk_kv(kv_indptr, decode_qlen) @@ -632,7 +633,7 @@ def test_absorb_decode_fp8(): block_size = 1 list_dtype = ["bf16", "fp8"] l_kv_dtype = ["bf16", "fp8"] -list_nhead = [(16, 2), (48, 1), (128, 2)] +list_nhead = [(16, 2), (48, 1), (64, 2), (128, 2)] parser = argparse.ArgumentParser( formatter_class=argparse.RawTextHelpFormatter, From 4e07275107a969e0f4c5c5f77c0277230e3ea4f6 Mon Sep 17 00:00:00 2001 From: zanzhang Date: Sun, 21 Dec 2025 22:19:17 +0800 Subject: [PATCH 03/10] ps ready --- aiter/mla.py | 6 +++--- ...W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 0 -> 31904 bytes 2 files changed, 3 insertions(+), 3 deletions(-) create mode 100755 hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co diff --git a/aiter/mla.py b/aiter/mla.py index 7173cd419a..d5c8e637b2 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -296,7 +296,7 @@ def mla_decode_fwd( final_lse = torch.zeros((total_s, nhead), dtype=dtypes.fp32, device=device) - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() aiter.mla_decode_stage1_asm_fwd( q, kv_buffer, @@ -317,7 +317,7 @@ def mla_decode_fwd( kv_scale, ) - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() aiter.mla_reduce_v1( logits, attn_lse, @@ -328,7 +328,7 @@ def mla_decode_fwd( o, final_lse, ) - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() if io_transformed: if persistent_mode: diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co new file mode 100755 index 0000000000000000000000000000000000000000..32129cd44c8de35e407307f0a14460312ebf61c6 GIT binary patch literal 31904 zcmeHw4SZD9nfJ+^$p?3mLkLN@L69Lt7(y755W;|{GZ4PyDbg?vF73N^yZg5LcDviY{dC^{bI(0tG7bcS zY`SkV`91mH^MB6AbDx=e&U2r8@64v@b7v@Vac(OUFAF=*;>>LVx4CWKAU5X46X#+{ z_|I7~O8`x6-9r5Z44IutOfu^cEypyet}xInN=y}HY6lY-ll%z5GZPQ^L|Gz>Xh*gl zZWQ{%ySieXD6(ZXbHi+>pm;g$WP7=Pi#}^Cr~SL!@k}?dg=k0PUFnO)GWH9>`PeEq~dghhPEM3A_@Fm}x^2!FEbG5&|#$Vm===Ot7U;V0vAG7?eFFAcH z8`t`(y}p$z{S6Jz*ZY0t-uvpS8vW3em)C>yHq`l6`VXzDt}XLbA9U9E*7y(BHP(9< zJ?E^gsA%vvvV+css{8#(FF4ovs@M9xtE+0tza^@B%hpy@`0JZRvuEpk<+4#yj7Epc zHh7ILWKWG7>io}EjW6t>na`1#b8nk2$4reeX19iFF&cf@h8q4{D9yCT7_nP39Wfex z*=8Ohf6iGd?aYrcWVd#@Vl?`)?G*3(;!tLY3Pa=V|Ht3WsF7_HPoCk4mGUwRr~7YxOJintDGQcs_Cwa$XTcHD%-EW%Jv(3 z(Eeb9ciuve^uQ^-2VF$E;8fGqcJR>Ec7XMV7K|%;7oTtNu3m?wsJg1L(-kc}=%8h4qj;RR@bg7^ww4TYdSv=p&oS9#TO^8wu6&b+rgELR@!-W^Zgj1)* zpjQ^ip87aOtBbBlS7YjJ5vHDxF=)37@Ts52X!T{AY6OXkI%BU>zlt$t_m=)HMyqRL zX-Q|;{eB#aF=)4@z8{wqquI5wb&fp5-;Ya;G3aI3s>Nt_QCnYkuBxwHTjy=?uUg}; zY4lc9t@oFEY3crek_xXGCzLpo?5yCcu3A-Nef0}GHF-bIe$2VXx87S(?Q8VjS6jc@ z?A*}7^>ku+uGo9W{0Tmse%2QiuBrBUeFeq$6%>2#t;D}I#YO82yfp>I_`kB&zdql) z#@DdAz+2Za{coLh^;PS9js8wkF4)SBI_v%S;;gpIkdNW*j*EN2*|4gv9=)xtXmxJ{nwOBrl(Zq{mWzuoc=d)EH!!?!KqN$4fh3Zf+1 zvYzdJ51YYja&QB?y2NefZMsx33%3Aou_Uu5?2OFQ#D(LwhQm-KCWOudwlT%NO|ja; zR_n+x>V*|WAY%q@WeN5UY%46-Zed&6t*oit63+vhLfborhKCw`@uY+hx?(%fCyhfo zrODMgJUracyX{O0V2;~zd zB!o%{Qp1<;;Z8uK;9ch`re!_RMJ|hlV}3w(8iCCXcIi z4%^m#nzcf9Kz2j+wujl*+UXM%W}le&n29@0{JM!x8rUz~&+y0c@NxqOga;Uy5>7EN zBb;GiRyfN*)PIn&V5n~(>Klmq2BN-!sBa+IznT4AyUtFxFPMKuUl?4tLuYTdw@lC5 znb$PCb@sH`t=7D;t>c`p=!Ii#9iI$p?kL3j=)%4@2{ldEv$BFZd#Al{?DzC(d81lq zvP12ok{tnk)UnO!GaUhVu_NqosGo;7ElBV02xPG4vo43iGT7F$>0=a@KE_d*j_r{K z*L*g80X|&0&U-dH@+wC;cRw)7nOjYLq>pv5L1Ub3vm@8w$aD+A(SKq~=L8XXA88G&;31 zi4~nmLLP8zYHDRLH8qE&mYz>i6ZF|A8{0OqT{_>TyA-h>x5DO3*iNWm{EYazKxTY> zz=~~GYN|eIz<}Vw(qp>oR`OYEdt%sSd_IC~N{(yFlu9bw@j=yH*?Ks8gs$U57#@*h zU=wT%I&yP%GRMfEBhMbr%S(3SP|qBbTM7F|h8=myNk{}!IhZ>+C!77LU7zBDtqac& zVOE_DFfq-lCLfm4{((7W9NOK7f??*|=g#%J*3zk>H;vv@!BQo|oCB+$wh8s#Lnz+_!u zYDi!@TV&`px4*!a-z~667w8HJoWc$o zdbeAkVPjo?aSSJJ`OE!WOB=)NXuD!bNDQ-A+gY47KFqeZ-}wBoTN0GqfE#OG0@m@2 z>A8Whq6Na?P2sRI*g##e2XrMHG6ZscphQUvq{pj)jQBucC$o+WTCF+ZM8t7ezg$*p zWsoIYC?_OTe#2Pfl$^wHcB>t0k)AEqL)H)LQ#RH^%=O_BHyB8^{vjK; zs=6#LoMy?X9B3I?NyjD4wP`F~*GFJ|!}dUIhqE^#9zegQE)1P4OFh{}YbLcNy(~+; z4!s_smu0C(0Du#Zi1ub#>UEU$2)!&zy>WtpHM((t;ZIJDd9gr4CD`XdBH)IcFFJvF&X-FGnKjZ-95aclAdB}^9 zmmo(VuRxAMUV|Kiya725X@`U$Z$VB#-hrHiyazc2`2cbnau)I-y9M{Y93ms=cpBPqUo2%nM6F-es{hlr2ca&wcdMmcw67~6In14cP1 zDTmrq4BeKiCL>Pm)W?reD@Dwjn@rmw9QW~vt>f>-b#+zE39@8y?BjSy0G~j7ax(hk zIdojx%;VY?aa>dU6B&bd9^*Ge9OF3`j%nidU;Gsy(~*TGmmMhmu0Efp>J@D_2!au zBw{F3ZK2o-xQoRZxW_GUA5)BSn_J)kW--c#-2z{v*a+>8P;3M|%Hj2A*Z?(e`r*hjG6N%7ey;dV|J@de@B?^{yL_dXI~G%xaFAX^!bI$DDix zW4?TuW4@9GyE zLby9c;GO}{NBN%gdozT7UzWh9hX#kpvD${CP1O5CZT8K+&^@X%O+hYL_Y>iwho z&rcBg7bgjP=_WA-*VI6>-9-YfL)trdFNp5_>8e`4x;!*_95ps*@rAkJb6lCV+ZgB3&p%1T+{GR@aMu@N zo+k``4|~eMeWdRk6O$Fgxd7U?QLGL;z`kkVVfHNpUkodSMoc@By>~h*OJQ1;BQtrl z-I~hSnbf!^Sh__E#5;1;c)MLq_1FV;_Wd(<+v29-z~R7bU^Z|Ba0JlqaR-KbrUtS- zC4mv18G+QHLsYvZFED&;o{|k20m&RXL^KSeIId zQ(=3m#Tjt8U;bg{LPg6ZT%u@sgv%ApNqCo{ZwS-Awp^qPYn-E80}TW<@I@d|1(D5I&-4vj`tmv|9-uSG2i=Pbk`a!lx9i zl<*lvTSRzJ(H0YaQ_+?ZeoN7o5x%TwcM!g+Xm=96u4s1?epk^}5WcBsWrS}lS~=mn zidI4RzM@qUo>8=W2>)KustM02S`Fc+idIMX6Gf{h{7lgr3IA2m))D?v(bg0Ghoaq2 z_!~vrNcel4SP}kN(R9K%ixwn|w`fg-{Vdv6!W4`45a9rewv8~uqHQP4vS>R9hg!52 z!r>OJmGB0Owu^AIMcYj{)}rkpEU;*M2`5;zeT0)N+S7zLS+q7nk44*0INhQhAe?Q{ z4iV0?Xmsp4Fz>W@_9wHKjbrBsd)2_BdEi?;uO#OKcX{4QE&}fMoK1EC_jrDq%w9L@ z9b?}$@c0z)dp*BSb_4f$l#~+S)1I`HS->_=b_#pbs25^y8+c+a`2C*3lv3aU&(xH~ zz(bxzDa(L|J!L8EU8CMf_P&9q?gan5r#@u`@I_BBr5yN@r!|Gv#7b-*@$65j2EO8X zC8Z8{)bmzKy<5zejVQn7Ih&%-67fRY9U@*>5B`|vrzxRo5jUIxe#=wH^;->nTMNo3JX3i{ln?Ae z`5n(9uFo;_hxed-(o@DmqWt1Ml;88z^GzqakD1y~_mn5dFE@5NfchVJTDgp&^tq-_ z?iM}?)tSBs3SS`l?ucVEzB6N{wc{qGYrLb+HS9kU5_r^(zB}U`-uJ~je|7OrbWTRbJJGor8SgMNr=-(5gm_2hmSnsmb4)Vc zk+~)r@5r2!jCW-2Nya-e2PNYjnTz`GjCZ=r8DSOajCbO24fDZ2264`<8?wO~?3(W} zbMds68F5ckPL23j@elD7@5IXcTrA&1Z7AM}mG_C2@1gn>@5IXcTq@r~?I_-fmG_~1 zPjnuKayyrk12W>(u93CDAOBtXo;Uy8 zHQytz-_jaI@lLG#&!ys>Sb3lB;+z`*ewSaIQ}K-J^Ji&f{?! zPwg)m?@<20tk>gshjI>=6Yo&&!K@cE-XWV!eav`=Y&G>U;~laCF$wK+%y@^!H}zK> z?~rYg@ebL7@?T88N5(s}kCeZ;SiD2~LHfRU=S)woZ}sJS5ZClw-_kDU`qp1VywjKO zIg6VCx?bP9l6+6|A4R?=ANQ|++tS=IbF*u!3-?Zc6xZA}?-L`>L-7!;b+O`~Sb3iA zu6M=C^IYnBSFAkGMn&bw zYf@B>yaq+($ZJkUyny>@=zbC8_`>;N*VZtWW0#U25cgI(lId9`JiAl5#o??R$=*Gk zUZAj$!%>O)xJQbe$@0*B9PggaAU-$O;D-}GP4Jn-=j9oEHu2L1KZtmz)8I!CKXWg; z*nK^h6W7GLF7<`Dqc6_5;y9x(chZ+TiO!uw=S*a5ak1PCtS|D8Y`dBx32cBtGHja z@BUOA`+e(b-@4kjuHqh3tczdvx{9Z{TGf8lwRf9pk+fBPkszkEdHfAfmUUp=bw@4Tk+*N>_EZ{JY) zcaN*Qqg~~1hE)D{Z>jw46Dt4SJ1T$oq{_elp32`prSczqpz<@PRsO@XD*yWrRsIhj zsr=l>D*xn1D*yDyD*vZ{RQXTNtNefcROO%jv&#SFXDa{KpR4?T|C`Ex`3sf*pMO{R zfBZ`2|M_c`|K`6`{(pa~^56eX<$wHx%K!W)m4`o9d0d>0TP!vnA8+G{i8kJ^pN-pW zHlC7VF_<#X6K5(FoXJpv;pg}gCm1X1CU1#G%huZkCVKzQ|xQ&k(VdFR4VB@1k z+4$(uHa=#IjgK8`!tDF)%+1)G`S@;*Wy^GZ`Ev1H z9dqcrI)2ptAfAhCa^&RDcZJ~lOOh!UKPPcg>m;1Nr;X03OtXv()G=|7xbgij&`+B) zx0TNQ!wxlh77icr9Hb*T-KHAf^+EZ1m#S9MbCA?;(2HU%})JeiKn0UtW zV$X!=>{$CiJUclM&rUil`GGu3VIbd96ezM3W#MV25rMcI3hiC+sMM9mTz1 zM~mr?F|cDC>=+L_Cia3Isqw-eF4!>zcDP|jNiW#J`UyKG!H%0?hX-~{?*%*b6k*3? z*l{!Lm7WaZ5{!|hcF6f_4(wO}J8pv=xA%e_ za{gKdJMM%XD_}=?FW4dHujR1gF4*CN9sXXhL(X4!z>d3NM;Yv>=mk6E{Ph*s;e{P5 zVaKXou;ZG|Ujxkfi!gHjqB6~2gc0*sCHzngKh(hwjlJLpIe%5bjy163Uf8j=7wnMp z*FCVK26oiLj&;3Yhn&Aw!;V_m(EvN{>jgXH{Iwo-Y=j*;>}cu*JLLSe0d_n9J2u0P z=3cNv&R_S#jsWZk!j7%IV27N)z6v`w!HzAkIFZ@`RifW@d)gA6m~q`3wFr)Ya8s?0Xte@$L?ORL(X4cgB?3z$78VL z>%CxyoWJ(Kj(xDB4R##p1v}*Y^#tsA3U)jLI}Y}O9diEK3p<{M9s6O&pRo4*E{^A};{{6%G&zX&7duUBv#*vRGKhTz+gf18Wkm@6qs*ZcJg z8sEKZnSOsugR1f+=0G|Mh1(DauP$X*2%%iJ0~MAI2i2*H!vo1fJmlY;0lQW3x!|4P3&9tIcP9yc4ES;2$Ag~;enqn2UErsHcY`khucr!r68M|I zd%#Zz-47WKNb88@F7j`H-f(f{50@0!Ji)@c#&`T{G7-)7?`Is1M@=yi>QvC zYH+TQz$t8iQFglp8sDvXA>W|j`;PIsa4Avw29uu>#pj#6JBlwd`H~C#EbPZz>_;i~ zV=?wa`g=C`dEgg86uk8J9PkUk-v<76@Y3JQz~2de1^9CC(%;L$ z-v!F;~M*MP4FzYe_g_iFI9;2Xf- z2VVMnJ@}2_b?{B#rN1|Te*pYu@Xg?*zwZYh03QUu6}|8?-v-+RFC1K$Sz0C?%|C%``i{u%HG!ApPd1^+bo{ooIQ zm;Qbd{5QZq3;sFq*UaCU@DKTW5HQl;#FM{?kMuY36Vuf7#kq z-m034T5m;dz1LS=?X7L_*45WmR8{*M#x>Nh99LbvZp~OQ_xM*f8U=erQNC+pMR8eC zp{pX_UtBR^;sk%0Z>7uU@=Yu(D)*HY6pk;?FZPe_H2Vu1a4%koY+qAX75!cW@dt1N zZRBf^QAlKYxsfbmxc?TfM0R{kSAML?ugL4lyG-6HDs(ERnf$V4os_XL-{h-jbmf%=}M$V?A@$*OvPl zeazYDU*Bld_N}Q}3EE#>UyF}kc{x6K%(=354gC-gqaUhi^pycC(OPT?WYxYhf3>&3 zRXow$%+D{JBsUADO_iI4g)=%gC%8H{T?KNpWJ0mrbd_{AE1J;R%vI9atYkuwS+}4d ze}Zf_qjQLY;;FJJ@aOZPpf<{dxG?_z z@k+Ivmt>jR(p)E-$@xzrU1z5k&3`l&!foRv=SAEX!hRbL*UI)x&UYr}nDwPzwl`^( zg$?Rk>g9YX(Qa0hdTGDJVbEJ5+ROP_V!hd3f|##liLp`GG!8A33Ym6GbYG}yY(Wo_Wg4%WO-{Dg z9v8f}R^ZR&LM4iZt>&fy^YEYS&yOUrE!%%PikJSAxD Date: Mon, 22 Dec 2025 16:01:50 +0800 Subject: [PATCH 04/10] n32 ready --- aiter/mla.py | 13 +++---- csrc/kernels/mla/metadata/v1_2_device.cuh | 35 ++++++++++-------- csrc/kernels/mla/metadata/v1_comm.cuh | 21 +++++++++++ csrc/kernels/mla/reduce.cu | 4 +- csrc/py_itfs_cu/asm_mla.cu | 19 ++++++++++ ...6W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co | Bin 0 -> 31744 bytes ...6_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 0 -> 31800 bytes op_tests/test_mla.py | 24 ++++++------ 8 files changed, 81 insertions(+), 35 deletions(-) create mode 100755 hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co create mode 100755 hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co diff --git a/aiter/mla.py b/aiter/mla.py index 7d56a950ec..a22a6fc4d7 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -235,7 +235,8 @@ def mla_decode_fwd( ) if num_kv_splits == 1 and ( - q.dtype == dtypes.fp8 or (q.dtype == dtypes.bf16 and max_seqlen_q == 4) + q.dtype == dtypes.fp8 or (q.dtype == dtypes.bf16 and max_seqlen_q == 4) or ( + q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 and nhead in [32, 64]) ): return logits.view(total_s, nhead, v_head_dim), attn_lse @@ -270,7 +271,8 @@ def mla_decode_fwd( num_kv_splits = get_cu_num() if nhead == 16 or ( nhead == 128 and q.dtype == dtypes.fp8 and kv_buffer.dtype == dtypes.fp8) or ( - nhead == 64 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 + nhead == 64 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16) or ( + nhead == 32 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 ): # Natively support cases pass @@ -285,12 +287,12 @@ def mla_decode_fwd( else: assert False, f"{nhead=} and {max_seqlen_q=} not supported" - logits = torch.zeros( + logits = torch.empty( (reduce_partial_map.size(0) * max_seqlen_q, 1, nhead, v_head_dim), dtype=dtypes.fp32, device=device, ) - attn_lse = torch.zeros( + attn_lse = torch.empty( (reduce_partial_map.size(0) * max_seqlen_q, 1, nhead, 1), dtype=dtypes.fp32, device=device, @@ -301,7 +303,6 @@ def mla_decode_fwd( else None ) - # import pdb;pdb.set_trace() aiter.mla_decode_stage1_asm_fwd( q, kv_buffer, @@ -322,7 +323,6 @@ def mla_decode_fwd( kv_scale, ) - # import pdb;pdb.set_trace() aiter.mla_reduce_v1( logits, attn_lse, @@ -333,7 +333,6 @@ def mla_decode_fwd( o, final_lse, ) - # import pdb;pdb.set_trace() if io_transformed: if return_logits: diff --git a/csrc/kernels/mla/metadata/v1_2_device.cuh b/csrc/kernels/mla/metadata/v1_2_device.cuh index ce1b405a79..e466aa00c4 100644 --- a/csrc/kernels/mla/metadata/v1_2_device.cuh +++ b/csrc/kernels/mla/metadata/v1_2_device.cuh @@ -371,8 +371,6 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba torch::Tensor& reduce_final_map, torch::Tensor& reduce_partial_map) { - constexpr int32_t kPackedQoLenPerWg = 128; - const hipStream_t stream = at::hip::getCurrentHIPStream(); hipDevice_t dev; @@ -400,7 +398,8 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba const bool natively_supported = (num_heads == 16) || ((num_heads == 128) && q_is_fp8 && kv_is_fp8) || - ((num_heads == 64) && q_is_bf16 && kv_is_bf16); + ((num_heads == 64) && q_is_bf16 && kv_is_bf16) || + ((num_heads == 32) && q_is_bf16 && kv_is_bf16); if((natively_supported == false) && (num_heads % 16 == 0)) { @@ -409,7 +408,9 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba num_batches *= qk_batch_ratio; } - TORCH_CHECK((num_heads == 16) || (num_heads == 128) || ((num_heads == 64) && q_is_bf16 && kv_is_bf16), + TORCH_CHECK((num_heads == 16) || (num_heads == 128) || + ((num_heads == 64) && q_is_bf16 && kv_is_bf16) || + ((num_heads == 32) && q_is_bf16 && kv_is_bf16), __func__, ": only supports #heads in [16, 128], or (#head, uni_seqlen_qo) = (16*N, 1) where " "N is in [2, 8).") @@ -441,15 +442,19 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba params.qk_batch_ratio = qk_batch_ratio; // launch kernel - MLA_METADATA_DISPATCHER( - max_seqlen_qo * num_heads_per_head_k, - kPackedQoLenPerWg, - params.uni_seqlen_qo, - topk, - dispatch_mla_metadata_v1_2_device( - params, - stream, - max_seqlen_qo, - dev_prop.warpSize, - dev_prop.maxSharedMemoryPerMultiProcessor)); + MLA_NUM_HEADS_DISPATCHER( + num_heads, + MLA_METADATA_DISPATCHER( + max_seqlen_qo * num_heads_per_head_k, + kPackedQoLenPerWg, + params.uni_seqlen_qo, + topk, + dispatch_mla_metadata_v1_2_device( + params, + stream, + max_seqlen_qo, + dev_prop.warpSize, + dev_prop.maxSharedMemoryPerMultiProcessor))); + + } diff --git a/csrc/kernels/mla/metadata/v1_comm.cuh b/csrc/kernels/mla/metadata/v1_comm.cuh index da8e73534e..b762f70e68 100644 --- a/csrc/kernels/mla/metadata/v1_comm.cuh +++ b/csrc/kernels/mla/metadata/v1_comm.cuh @@ -384,3 +384,24 @@ private: MLA_UNI_SEQLEN_DISPATCHER((UNI_SEQLEN_QO), __VA_ARGS__); \ } \ } + +#define MLA_NUM_HEADS_CASE(C_NUM_HEADS, ...) \ + case C_NUM_HEADS: \ + { \ + constexpr int32_t kPackedQoLenPerWg = C_NUM_HEADS; \ + __VA_ARGS__; \ + break; \ + } + +#define MLA_NUM_HEADS_DISPATCHER(NUM_HEADS, ...) \ + switch (NUM_HEADS) \ + { \ + MLA_NUM_HEADS_CASE(32, __VA_ARGS__); \ + MLA_NUM_HEADS_CASE(64, __VA_ARGS__); \ + default: \ + { \ + constexpr int32_t kPackedQoLenPerWg = 128; \ + __VA_ARGS__; \ + break; \ + } \ + } diff --git a/csrc/kernels/mla/reduce.cu b/csrc/kernels/mla/reduce.cu index 1ba51d9329..e24d4f4f59 100644 --- a/csrc/kernels/mla/reduce.cu +++ b/csrc/kernels/mla/reduce.cu @@ -607,9 +607,11 @@ __global__ void kn_mla_reduce_v1( MLA_REDUCE_CASE_EF( \ NUM_HEAD, 16, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ MLA_REDUCE_CASE_EF( \ - NUM_HEAD, 128, HEAD_DIM, 128, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ + NUM_HEAD, 32, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ MLA_REDUCE_CASE_EF( \ NUM_HEAD, 64, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ + MLA_REDUCE_CASE_EF( \ + NUM_HEAD, 128, HEAD_DIM, 128, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ MLA_REDUCE_CASE_EF( \ NUM_HEAD, 128, HEAD_DIM, 512, NUM_WG_PER_SEQ, NAME, __VA_ARGS__) \ else MLA_REDUCE_ERROR(NUM_HEAD, HEAD_DIM, NAME); diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu index c3b5189d9a..d880476da3 100644 --- a/csrc/py_itfs_cu/asm_mla.cu +++ b/csrc/py_itfs_cu/asm_mla.cu @@ -171,6 +171,25 @@ void mla_decode_stage1_asm_fwd( "/mla/mla_dec_stage1_bf16_a16w16_subQ128_mqa128.co"); impl_ptr = &impl_a16w16_bf16_subQ128; } + else if(gqa_ratio == 32) + { + if(persistent) + { + sub_Q = 64; + static AiterAsmKernel impl_a16w16_bf16_subQ32( + "_ZN5aiter42mla_a16w16_qh16_m32x1_n16x1_coex0_mask1_psE", + "/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co"); + impl_ptr = &impl_a16w16_bf16_subQ32; + } + else + { + sub_Q = 64; + static AiterAsmKernel impl_a16w16_bf16_subQ32( + "_ZN5aiter39mla_a16w16_qh16_m32x1_n16x1_coex0_mask1E", + "/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co"); + impl_ptr = &impl_a16w16_bf16_subQ32; + } + } else if(gqa_ratio == 64) { if(persistent) diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co new file mode 100755 index 0000000000000000000000000000000000000000..af4f8732ae2379b1c006cb754cb6c2c7f2225025 GIT binary patch literal 31744 zcmeHw4}4VRneLY}lMv2KK0`>t35Gg^2tx=%5&{eeI%5+4NPvK7!K%ZAOh_dEOag+% z;V=YLiW(y#BHD<6RH;Q5UF_0ibg80?E>P>0w(h09>)!6(c5kGitL2l+%WqoC|*iG*{{!uTNCiDu5GCLu4wM7TvJ^gXx<{az0l;ZlAV&{bUIkM z-e(LUM`~T)6nLR_a#25JzDP35Z!ME^rpB4GPoY|zPG7Z9!=H<#Oh=p<`;_U7)9I_0 zd4T*m|8}XfAkLJ1>U78H^i}H=@B7jUDby2Z&OU|CjnnBW2`!aEE8@)Ar_g0_I$b59 z7*E!7ATm-NJ5Cw~)X&c2m?9;eq;QCiv) zc7Kq-;!N77)DIGp<8-?^TIb6{{DXwlIFnw2RxM7q3u=ASwW_&sO_Q%Bu&O@L(CVwM zT^FeG(W(1GN-Dfze4xZ9$({!Oy4qC@*4MwpQ+35U z>&{N?hXU8?Dnz6`gMFZ|@44>tuMJc;H^ObS_a6Bfd~Kk)rM9u5Lna8s$M`S(Pt|D_ ze43Z}5VOS;;4ow`!fAr=m^`(YHj+522md(@F4HpW$CDHPe!K+?8Kn~9PT!9`XZ`2{ zZ$ZI@UcHrh%)Cv!);EE?$zo@1*eRK(Nr)tDjYLqPB!%%mb~q_)3}H?5!?Ko4aFFWR$TF z*F~-~(4C#F>!U`6KC~&uRF49!n2Xwuj*J#<91dL{Jvwx1xM15i~ceq?58F^Vd7&ZK4s!F1`dr3HRfF&S#IF4 z$S?y_A}I!DL^2G_iewpx_774<3+)X=djrwlK(sdy?F}SLvN(I_9tgUE}P|pg!*Crt~?^AY*!W#OYMOh-_SxevLDj!M2=nI~A6}ww_6!ps@4_ z&YE;=k2Sb0XVMqpjmvZ0v&orXGtRa1fpM<9IvOK=qLYo7;9{Gcc?O?A9Lg5ph{TE@ zp_9GaHD{c=W32nAuCq70-1%zv^i9*ZOrJAHPhiuJ=h4>}XB}NbB5vdFBgAIrx@XR;p|T?#d*{y4BROMq zy#jwjV{#2_gT|0EFZVI#92;`xJ0kh{b`K8Q%n5ll&_6cf%(o{asY~Th-i+KF_UA5r zrW;xppUY-eoeeWF&BWm*rVHE<*`QNWIO0YhTIc?g^c#cyjL?40{m-MhzrJlnUX zw(cQ1>%N|~Vf%}pj>2wx1maCL%E@7Yc9vk2Q#}IHb%9z~U^Y{XdWT0~t}f6S7MRa0 zMty-tV6iUH9Tqr~S&e#+M_?(fA5GVo8aD}zdWF%3g#}vKVxyew5op&1riKNkvn58o z<`I~!3v`49=CWIjdZ$NVzAmsJEU=g@HR|0Sfirc1p0L2)8bhWZpEvq2k3g#~FgYyH z&h{GhsUCspx+t~ZeWF8`R8o2phMME;mIHdS4g5J|J-)(p3dt)aD& zj>$BZsOw{}4q|&awj((kaNfYUW?ckzvMlRK8?CL>m+EC%*6XO(qv~Z@*5gEhpfjq! zS(f!W%6e42EX#VF@NgY3s=ry5^*YLWRJ|{|oyt+?K3%E6d$k0% z6m~mo8SD<&ov^!MD`1tdDp)nF26hju4%Pr`f;GchVQXRQVE4l|z=ALx7J{|Gw!$8Q zZG&xxJql}wb-;GOcEWbScEk3-o`ZG5_QLkT4!{n=UV>bz%*!!@Pun%FUU}s<-!9Iq40{aQOorC=X_K&cCg8dTqFR)+1 z{uTBc*l%I~4*NaqbJ!nYe};VlV;glvfmvZmuw}xOw zEC-eg8w+#7@?iNe7pwqQ1S^J3g}GrfU^8Ji!aT5>VWqIyu(_~XVCArd8z~NHbLQD8 z4%zI?A4_q_L--rXoRC}-a*8;rGcV6>HOhHoBiMG}7%U+A*s+XQnx))0}h0Wz6~NCC>S}Xg6n* z+Wm**E!{&x*6t0~w(gC`w|6_(&wtdOpp1<5De)bNp=`NkQ+dbGP&VPt6oI>jp+3g< zRKGhz)bGg>_}r*aww$YTH2TE6Kh$UM^`h;*@d6J_6nL-@?PK0QYX8y{QGaN&h$N%zki!(V;l$Oxt{oA z%f%cQ`+J~vU3Z1DV`GDjZ;AIN?8n&V`$uix4v4mLE?Pr~KVB>L>z#VEkBSpy{fT=; z{rhV~z3fkG4D~;`Uetg10LG8%FY8Zj67^@chuGJHF-b@Igi1qC=A=q%pE1tkU;K0k)-Lv>fjhnw^E_?vyV$b^?xFgg zIk8wVd=^0ePKwom``C93JjlLl;Gu|8WW=;%+54xmvJ|FeIWz6!9oAIFPNyb3&C)Ge zFwvQ(CORBys@D;8u)jO)uq|yH4IB;30p0*-Av2ft*SNG4)KsjS=O>U}Ph5CT#90hgV_QEOOd`NE~T})7i6j z8~o@J!H))?Ve$^Fe;MP?81*@2qCO|4-TbEy`s4Z?^U!JgliOSCgFNTyOHn# zMe`7DQnZ^1w!goPGuH{leEwuf-KMSG5LmPP9%^jfsNgk=_OAK^TUc7Sk!MWbWaiFK#l zyVuTMHIALb>~#Z=jZtrjH z9^f9Yl2QtM&YPAp7ue~|Nnvjp?ZWJb1|Ba5zt>xoQUToOy*Xtm@PKzo$}-?VZ)FO5 z&uDjoePH0pJHfx?ZBAJMJmd|fQ~_V{cBIfbu?E|Ry?ayYfUkL9OKAcg@xGnX>=El_ zE6Q(p&!p&cMZC~?hlm%}fj{c~MM}6%#0}?yB5v3K{`=nFqP-F0wCgAz^C~miiG-W(nlG#2z$9U-1sji#8Ph$UcnXl?%l?w69cup?HVpr24^l z=hUUdI~U3)4#qn;-w(z+|K;MH*qn@vcVcrhGTvckPD!Wp5aJz~TaxjP%rVJ$N9LMj zyd!f?GTxE7CmHX^9F&ZAWG?DIGT!McXN1+eC*DcGb-f4w6vAh2-LUo6Q15(?nTw}$ znGyHIf8ynN zDE~uqQv5^pavT}gQ2vMNDgKF<=b`)$)l>XK^|HT=dno@y^%Vca=6Q&x{Lf{@KjhEo z_=m2IU2gn?F?z*6bp0$k{-HVgihpR{==g`mr1&NFnpT{6NczZ#cVgqA!T9Hr;-3p$ zcNvU-2IHUDd`~{s8Ora(=6wpl$IJK31W);$*u0Mie7t;5Ie5zN#O8e}z{kt?+zFoY zJF$7672pT+J?F2Hb;2M2QTd*?{?a?&Bk#4Na}>op@$x?xi+AGXefo-b;^loV7VpH% z`&>%A6EE-6E8f9pb=vQK#XIyl9;ZR-f6;h{@&{(Se#bkM$GDVuhjI^QySVWVX*TQQ z#yh0dtdAS-kPgHkw9j$l9h%>)zwCI2v_;1|q~&ts9nxpUH(zZ&@$x+Vx!y(l z9`70#@t1Q=D>{c0ovWeT4UH9j&5Fv=*Qlr*eNBqW(bu4;9DU8nxITpYY3P0t#`H)* z#Jx3w)3IAA2#R|vopySr1~z+kJ?@cWr?b3tAIE#AGKkO1 zGx*WOmk2(S`22i>&mq1{@FR$KxeR^`@pE>w3*FaqDc7;6P3(O*Uy3^h!KH*~L z)p+@YzRs)h@(C9^ug1$KT*`ShUOr*)ygGPZ#r>*-_ow36A3U!Po>vFYtGLG$=f$u3 zyo#r>y4hJg6TtCd=2IY^oi{#1oHg(L7nq?7%nS=0!9F$WHIKk7UEs*Dz-ws?`Fv

COn@8Oabzeyx=t@ zt{`Nd=AS*yAoKJ!pFPj$`vZ7ccJP|AuLWsa!+rP}8!N0?z!4jlS^44$mEXEV<+m+W z`PXk(`SN8d|Hd6Ezw1tw`|eh`e}&3dR;oNurSetPDzB|k`RaRAUSFs3#s-z&+obZA zW|gmLRr!5uRla_m%D;KP${*OE@{K{2Z_-u1Ii&I}Z7P3otI8jKNaf$!rt(L&tNgJ? zRsMLp%Ae>^`I9?T{?tyDKfO!k&+JzDvwKwj?dMefe5cA^*sJpW`&9no0hNE}pvqr< zN#)-?r1I~*qViV{tNgECQ~B#hRQ}g*sQk^ND*v1BtNaJYRNmdC^0&e&|J%1!{=?%c zfA<}gzjs3AfA_x1KRBuKzyDC>r%$Q;M`u+2Cfm{8r_^`*)T9-`}hJ z51*_2Pk&VTU;eD}$QLS4NU(8>#l{m8ZG6ZO8y`B<#%(qmPf4-yYp${JVZ&^E_;4G~ z$guGdBWye?%f_$0*2YJTvhnM#v+>cRZG6la8^8W~8y`2$#>bDh@d*=beBwkKpESwF z3kz+0@?;yIGR4NHO|$Xo({22Q8*F^mEE~V+CL8y9ZM>w!#>>iVe9jyjpEu9O=g+tC z1q*C^(IQ*PCR?|f*w$@)w+Hg)CG*QWqQCp0-2Cnb!Y_UI!?I=i^ySONcR!TVcR&23 z>p?s}+2+j6rSG1=_hHy6XP-7cx5hf(-I0dp8`JQ7BYhtP^ME(YX_m3UCMNDLkJdqX zgNReL&&8o9o}qNw(`~BpeG-)GcdKd*Jwr)zg`CNDntLDSOdpzjzD+Wo-^BBo7kZvV zXGgn+L&tFFa#{+4`Ie$!fu%TDY?+ExqogbojMln#InHdN@C4jr?g!wVf{1E51s5jti-$Boc&Gjz-z z03Ge7KW>1Io1mivI_3<3j#MuELC??7bptF-PLIHR9naOGoEH}8GTIC^xB`#BVm-wu z7lj2*rPyG&B{xXV#o`;Ox>@Rmb7{^@T_-Ha)FZjMHMkgz?=L_(-{gzW^K;>ka`>YH z{#ZHy{*dd~Jm^>e9gCskwgJ!~*RflmV|9+K!;q%RzpW4bhJRneFLCFu4C(r-GT*nBb*D)&7Iz||^jy(i_Y==ME;g1~y z;19WuJq#U>K*!_I@#FyLkn7ks=y()5I-q0c0O*kG*tekLG3a;#I-VK;9daGp1s!{! zqZ2yz4S){0jy(+>&qBxZ(6N62bjWpVH*`D)9ebhUzyRow>)12U@ongM0XkkB03BC- z9UE@0V}#M`7?o)qBaB+d-o-WM%gPTXC+pLP4h-@&)pP zdhh&Tn|DS>$&3XeH@N`0M!Q|tQ&Pm(RL1uYKgNp3hKh@Gtux#mBazb^Nx8=G`E8kF z3^5hot?2IXU~ZX16!$14+r@XzI_=2`c+?7cL^JQ`zimz)uE04g3nb z;N9S7g7<(g1+S+HemeMB;Jx6>z;~t#eg^m(!QTviHu$h6_#43A1il3P9PsC|1uya! zUz`>B3j_0YW?(^BU@^7PQw`1?7C4g)Gs+&1K;wHpW&T3z$zLe=u3r2-xs{mwg~`v1 z;R{UO6T=sqeCc_9F7~G!`%{7aS&IFUexC<^0rZv!v=ehc`8;Fo~E9lZ4WeDI6F z-wOWg;HBS}fxi>{3h-6nrQer>zYDw{d;q-k`yJr#244xj8oc!TH^BSAuLQpe{8jfm z#UW1ki;m-HzZ-Fi$w&L0c=9{7kM=w9VSie&Khp2D;OoKP3w{lF z>GylUH-K*jzZSgo`)crw;9J1o2VVMp9rz94b?|NArQg?se*pX@@LRx3zuyl&2tEXU zD|qSmZ-U+mpvHnWUNJH`v& z?P$DarU_KdGQI%bc>V5+NF!~0ZuZA|Y`n6~_GYmM%4-|&Ym74-_>D$Kd3mX$sKAAP z4(Fu0#)egaW?z%PwT6BW!`IU2YpuZ#W;8d|xA<228+$<9)s4+Qe_frgvBlTa+*n;(7igK((!6p~UESLHiD2#ttZX$3j_Tq9 z_q6J%mBmHw>Vm-3>M7Hv1SSFxwxF;c;&;UPIW@JY%Ehzlp}y5cYWt?>71J+k2WC8zm;cqO>=^(Bzwn zdh^RoerZhmDw8jb8Q*x5p))zqdz|~t_J{B4&2KUJ^bNiFc9TC^-`~j1n zdtYzE@o&)2*bm&}>jx;4YGF%X+3LoLVf~78ZHs z6or$^3S_q_g|Zvk7E5+&nOAO3E0!7xr+KGJ)?HBCpPyyvPapaz7hmb}h2918#E~ce z4*HAqXN7wN>Cs5>k{c$<-@BF7R=Hs!^`*62c9ZM5#JOfWTF+@Ne4H>|a-GLLIPCY4 zNTcj8*K>)vW_wvL`*e|{(P1`}^-{mY>riir>MzeX66N{E9LCrOxlK3@ zW*nL(88**95J7YS;6xsg+ DE>>Eu literal 0 HcmV?d00001 diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co new file mode 100755 index 0000000000000000000000000000000000000000..1f28af088bce0de9eb28afc7a0b0561b8d5df5b6 GIT binary patch literal 31800 zcmeHQ4SZD9mA`p2`QS}*4Iv3H7-R?$h7g7%gfJlL3=sJW5Kt7XI!wrfMDk@45G)Qa zLqMbm7?C2PjR;89T6EE(OOw&1RxMhfuB&a`rQLPc-EDW>?zY=cXaDEjcf({H2n5-5 zf1Angi6l%)9rz^WK};P+E4A5*O#TGV!vovn=HqD<{z0%MXNC3t3H?s8F<$RgU2t&V1) zcRbP)8%2?Qm~3vC?GzL*rk!jr_iynhEtRx?7dxKx`69HV@hdAmERK8y%npgs{BoxM6;(GeU-9NQjA82E7p09 zE@V$F>l*z}*Gw$xqnXc;nPoSZ$}v-8jM=N9T8u_twxNbU7fCbiF-GjwOh=4HU$&Wt z$e;6Wm39`y7_wJ8T`?Mc*>;Ne{g!fRs5{1(y&5_@Mx!fa=uBy7d5kf8HFRl=Mpwwt zGHK}Y7-RNoXmyN6=QY%vG7dGb@YVX7X?kMzJvy$@YPy5MBn<#w>;ay!8KLkq?wy^GH_dsnT&QdC>h((Q_!edwo$D|TIO z2YW8JgMEGI;83%7k#~A?w;MY8&_@q99Jt&L4qt8u&-bB&1I>|6IMRn6dN|?e<#urF zayvNQhYp@=_SQD6Eb=zi`s=$t5TQPF)Wa7iF1LfXF1Le|m(s!IgolgHg$XB5i9s(d zkUjZfj8^AelP<^9n8>7_~v2ze^YZ!Lw&nU6Nr=f82_n4 zngwU|GB0A*aRCP-lMzZ2gxln)zLa6a>BtQ16kPj>8~$R#wR3K;fF_|A)s?9QgP?zX z+x=5QE_#DvgUz*eT%w<=b9 z*lHaeM!m422xQE_%`Cy*g>8ieJ1uNurFP@YTLRV}%`lNA4 zr?k1+M}|ildY9d<>mx@7%QF< z!i0oSIbo_2Y6Q4}I#3680NG$AbOhiAhNzsG5c&*2c81vNongfqA>f3iI14L!^E$f__%?C!h;OIEDJ9) zaBz6Afhpk>12e)I24;n`3`G40C>w_Q2BN-!sBa+Z8;JS_(tfnEzv|T4sm_H9PV0+W z7j4(s_d0i$=I_XFo6|mL#+-I*{`mF@&X@F}@wTpYcwdU}e!8$PPC`x7^{lL*&fe@S z8vlKLM*f)gS?o~fm}EylA9JiVeU>8tZ+3+p4)yc!hK1=@IRY7M(;1gTVHs@mne=fA zOCRT`PRI6WgWGf_eIY(%dCt3A9r@K`oV)KIBPd$rS*uXasoow8}ltXX;-n{rfl6>chIM>;n$haLTl-k6t~ z+vaeLP8yq9oy3YyCm|m=J~g#En3|f)Qp?XKsR{ZVl#Okh*e+k-(&x6$-QiNizRZQ) znXsQwfejh)je*SgrhpaOuGCb0%HYAlMdinI*Ie>jduL+UWqekGY+9~s+O%pa+wq~* zT{(Isz+@I@lvCXT({+JbNMJTo487egFjp7o z2noz*7DHd)7FetcbcF;?V^%}&b_<+Ib4cS=rj4z_M!npq!$JbBY>`n;atln>1*V1s zrnAL{UULh~)&<%_0(05ThTh>8n6C>g2nj4^w-|btTi`TZpgSb6r;S0=k53tGm|LJ# z7nl?hn9TMY`c${TbX}kp5}3`NHuQG4z+7FRBP1}N9We9-Zh^(RKvziMGL5!vS58Q%{)RCZ zD!GZ_oOV0rEImien`{u~u^h~sm=nUIt~HQ8m%mTTO;yX|!oN=&ovJR43#VCftA|)d zSJPZc$7C9d*Y#1D2eCZ_+u@w`SUb?KsS86V%TiCa(cDUHNiWM%uS2g#=w(^zu}~mb zjc9L{rCvu_kI>7q)Ehx5=JM|G5w9_nrCvu_kI>7q)HB2?3~NAld$TO{I+Y{VKV2!u zd$kyH3*=VFQpoL)J0N#KmP0Bam5?e(HRNtcEuo`pOQc>!_+@)G1ILC!$lhkO9}5b|TlPa%H`ISctYBH9 zFOXkA{tfaOf94q+DuG@m5Eknv8g+ zTOU78troFOUNUWmaNNfu4vW7B*J4#QH^`F3v5(^+0ek}S$;s%C=hAU)GmmRq#BrT# z9M_25%y_N)7{4~+7|%U-OdH1lbUHrrbj%ya0Q4;K7^k*oS?cx3W1QNXWvOS8$2hf* zJjQAK$YY$^o8|8I<}oeplgGT&%VS#VWm)Q(c}z>aEK9u(eY0Dv+ZUW85m%t92+1)gAuM)@tb zz>_S=D4%i*e4nvje29=W-JWJ(raRX_ZFz}-!zdm(%zk+~bLWQfLsHn%R=9ro0m=1Hysh2S3ix)ZO%c9<_ zR<-MQNt?P7gVwI~*0!zPDf?+>-vf32u{V64DH;{_frME$7ukLo`=N$8)SBJhRl z#TayMPx>Psw2OM5iGQh7)IT~WnBCiZO7&lvkA5Q7bmERJ0%!W3%C9XEb&TV{T%23V9UU8NcwM|VAs_lS-#@DRhF{c`W6>N!{D~T|UvJi-engxg^>5uH z^zW<|dfA@l7;1lVozTB`Kl+boFZHKdh5pPYp_lDx4x;w&KPdDcJcRxu;tbN$`7Rwp z^xkOfN2l#Gf>y$uSrt}xdf0~+4HG5W$Ufz~Wy~`aE1?gg{ps2dv6=1`n4t^I3<(@c z@r~?5&TX;}S(bX5+o(^nBYj+jVNd3S3Tv-1&cmPoB$2Xuj~ck^3o*}Q2ET_rVcz|2OePGH1II{mVwWQl_Ddi9nIc3m6fG1Ez6OaJl1YaW$bin++!@= zq6Oj|d1}1fuBLkI0XzG_X}j%~wvoV*z#L!>a1?M9(C%>uMtY_Pay&BwqdYeSQio@& zc1wO>u@S;PqjD$4)=>c z%3P#qd4x+8EuV0iqB#lgRI~y@pQ05J`W3C1utw2J2*O(k5dXwwMSDcTK$ z_bZy4uvO8f6K+zpnS>82+D(KHDcWqphZSuu;iHOHM);VbEg*bC(aH&*QnbZ{2NmrW z!fz_tt%ToFw55bED%$OYFDu#|gs&>vU4-9NwB>}aD_RBN_Y|#?@NGq_B79fTstHdk z+TDbIt!TA`A1PWr;m3;BNcb~FYa;wa(OL-qS<%)I{!-D_68?vx-ADLqMO#n!JDgY% z{z=hv!Z?c-B#gIcZG?j?+GfHOi?)SuutnQSm|@Yj5oTGm?S#WE+D^ie7OkD|T8p-e zaI8h!O*r17?IA3*XnP4KS+sqGQ!Lt(gx6cN4nmJb+fP_((GC#Kv1o?~=UX&7b{&{^ zc6#H>-!<^~H1K;p zze;uk_j#0*nZPGKX(_XT9iE&N_PS9o#J*?Xi8Ao}Jw++yzyqG?DYpO*c^0QE1s?WP zq_DS*dT+6J4Lo@V_-8#$Da(P+dx9yIz!yC2DYPb5WBZ6_e@ZRzCC^JKjliRxH&UA1 zV!mua`4!KZ6n(ac7dmbi@xog0$2>nz3Dt_Y;cP&}4ePC}7dKPnio}oXy z2j#as6+9%$&+kL|9ZwVA@K*0JQwQpv^aT0E#!d%N|25HK7 z1)}e+ICj%_ZklE7x?brS@91TQ{YOFqkJ{09cf5l*pUSb~9euvh?pR3RajMr_yrVBL z+Jr&^Pf(i+#XI^!qyAeVfhVc{#l$<5Bf}DNigIMY_o>}w#XC5Mpgt}b@0jO3RIg9* z4xOi9S-hxt$2^Ckv0}zMbWRi};*pr~4%uqzW5zpVi>Z$p@6h-NKxq7!@eYj-{Uyaa zWE0wS=Vr(TlrJgXA-gW(-0p($4vlN(c#L?3`lNW}eDMzLE9uV{@6djcejwgCbusbI z`SOVa@ebDefq3WtUAz;WlacXGbZ$n*JIu@}>9h_Z-jTT_8Sls(lZa4@n;x@lJF+G!Xw>RQz+k>n;QF&p`YWo$txVJVW`N=)6w>_*nU#Y2Ycp6P@>Q zgO8Q(DFaXWo#?zzIrv!lo;$!(ekVHbvmE?DzUSOEvJUv;e=Fbf`k#8{d*uCgv_?_9 z6D$97p?D`&-lw;CCsy9)Lh(+lywAnNJF)UUJ>ngltJ8k>Dc+&;c$~&l`wPZ9ls_=* z^*P?5oWsS$JCu7c>&1+B$YxU?Gu|OvO?}LGhwMO1Li-#u-l6eL{UygcWLspsL$;v& zXOr)d@eb`H8OZmX!OZ|YuWwyS zzGu@PM82m0_jG^TvZ-rUt823h_fCHhx2b2|Cq|x!;vrh=V#PnP@;tp=?~0Y@xzP2l zSb3g{x!x5k&(oLdU9|79u5l56Dc7_jb2yQ?8p_>JUy;|Ws2q8Xipr7Kq^KNu4T{Q< z*PM)a0r%6;{UXTmg$u&2&0#FZE~OwK?yYnr({n9&#-npqCD)%i{`HBja`zuwxvP$JO)hb_gx612k zRo+mq@_QOp-rS_})h#N&ca6%|tyTF~?^F5x>s7uXpz>B-8d{pJ%c}3-~9#i>WzNYf;9#?r+r^;Us zsr;|rQ2F;xsQmkHs{HM@RQ`i^RQ~QsmH+TPm7hMP@*ka1`Cq@U^1t~&UjbD4MjgJ{)<73C#__%R4 zK7PE7Pnclig@ragaiWb+nq=dXC)@axDK>uHbvAze^)`OvjW+J_*!YYYHeOn4Ba_`Zu|%E!-3oYFo8=kRG`bF0%VqXUgh+$U~)ZwT}==9RV6`G44< zCeOwpB%XzIB&XX{<9i_}XYW$gYI+uu`VKmhl4-01=s$f>(z!ZGcn%ZKS)T8?5S<2SrY}H74htc7V*r;ut`Ec zEUMkrlLdcuM7u1r9bWcjzZWmsUPgnt-_8x*x`g7MX;l!AMDs^`ePjI zm;gH_!j8%PU`J}a@P`X_OoJV6*fFyo>|le09aCV(^{~SOJ4*Y(4n0NKF%@>)06V6` zj+^?yj-94Iu7e#n!j2iRV^%-dk;;WX=s6h5|6^)$xCQ3xcs>^8ypTYrQD>0B6}SZ! z>nTRLC?v3i*7zZo+yFh_if^XsVyWxTra3ZoovfK_V-f6F(hqjXd2BB1SO7Z~!;V|~!45f(&4V2aVaLs|#@o><2sKJhlvW+zC5;u*2UEcF1|`cGz(j?5KbpRsCRxoX5TbJG`)C z1?*Va4|ZJfd2Fybj}b=BV^pSjj4)yztA;;n;g3f6qop7GA?L9g*ii>N?tvYv`@s%5 zkKGMB>S0F{>{!zecF1{b73^q$9nG-g-hQw{&SPs~$9mYI!;ZFoutUyc>tM(Iu%i`r zZ0ZL)w^Xb zjqhS~!NGuGlj zel8FBxqRg3oB_L4@Oj{!;ETYQfOjVeejNA-;3tBg41RgC;9cOSfp>$S30_YX{1oul zgZF?h1>cb__^IG;06!i4P2fYC;I9LJBlsEMXMsPPEqIZS`1~W0k1#M_X9gC81Qt^r zJ=Ne`A%WA_V597I3pBoeQ|2SI?tFxT@6g5P!=*&!BTRl;6klNS?kK+4C5K=Yd}c{$}vEftP+?3jPl8%fVNI zmwsOc{!Z{d@P6>p@3(`$3w#CmD)7?pUjgq0zXJS9@K@aL6jwOlFFGzG{cgk^CLif{ z;>qt+Khp2Sli!Jt@Ow4(rxyFui2Z57{z$*qfUg685BSyKrQh!cUk|$qwcyu-*TJ`emwsOd{(kVS;5UJne!mZV0DKVqX7JMQUj@Gb{6_E(fWP8? zPlun#?-{^IzZ0Kt@{xWgp8QVrBmGW1`JMO(zi+|*Y{UNS#QyBU{z$(+2>v1P4}*Uc zy!884@Y}(+gWnBa`u%I*cYuEc{MW%tzwZIR4}1ss1K_3K9|Qjc_@}@h1TX!*7yOgp z_k%wKUi$rU@ZSLcH27!0Uva-@!cXM)p}K9GE{NA4RkDI&})&KWR9>G+P*BO)7F=Iu(J~sK^Zir+V`_$wcTYK{V zY4V>2d-4tRK~a*J{9RV5{Tv&ye$QkQ>c{#$6XvXMXz?>=<+^(OI16*GtY7V{_BB^C z^FQ$|P0ZQUQ0Z&&F=vZ^ZHrOcS68zFw7<5g0f$ayB@P_sT+vWRKV-w`hhAEI6~Jn= z7Fz;Yt*^pg>n(H@Og1+Q3W}!4&B7@YyEmuL=-w=yBsYr+O6BH^((byh5|?BrmUcID zm2@|oQCetLC@h##B!!O>M%NBU%&k;wl{EYMk>Enib1L=`S@sb-R%HPSA zYB^uaGPR{OKsJ->g2dToMOqhVES#Bv5s7xQqSQ)_Z^++UZWOfPBifYy!*`MzuNzEGw%=p^9`qiZ zYtpM)EKs6=|53xU`6gG;T|;ch_TP@;WqXOVJV@U~R-N4 Date: Thu, 25 Dec 2025 05:06:32 +0000 Subject: [PATCH 05/10] update 950 kernel --- csrc/py_itfs_cu/asm_mla.cu | 2 +- ...A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co | Bin 0 -> 31744 bytes ...W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 0 -> 31800 bytes ...A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co | Bin 0 -> 31864 bytes ...W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 0 -> 31904 bytes 5 files changed, 1 insertion(+), 1 deletion(-) create mode 100755 hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co create mode 100755 hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co create mode 100755 hsa/gfx950/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co create mode 100755 hsa/gfx950/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co diff --git a/csrc/py_itfs_cu/asm_mla.cu b/csrc/py_itfs_cu/asm_mla.cu index d880476da3..124f11ab82 100644 --- a/csrc/py_itfs_cu/asm_mla.cu +++ b/csrc/py_itfs_cu/asm_mla.cu @@ -204,7 +204,7 @@ void mla_decode_stage1_asm_fwd( { sub_Q = 64; static AiterAsmKernel impl_a16w16_bf16_subQ64( - "_ZN5aiter42mla_a16w16_qh16_m64x1_n16x1_coex0_mask1_psE", + "_ZN5aiter39mla_a16w16_qh16_m64x1_n16x1_coex0_mask1E", "/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co"); impl_ptr = &impl_a16w16_bf16_subQ64; } diff --git a/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co b/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co new file mode 100755 index 0000000000000000000000000000000000000000..7a0493c0e496829a2b4b3e32ffd60c9b5d5b7e60 GIT binary patch literal 31744 zcmeHw4SZD9weQZENeE{q+Ypj)f}su}!VtocFJM5>8ItfJUjfmARl_71NF-k-0m0&M z7y>FqjS(p#+K7N?twkStv6m*Jk1Bf63)K3gt@qKsdSCB-?R~v{w|)I|-hZ8a_AnU- z5`x~m`@0!_i~rgGz1QCBtjXDXopnBDOZlR?Nl?+!(nR9i zEE)fEW@kyDN!KH)KZhZ+Gl|J&JEG;7Ce;fmTop&7#Oo+06}eJO#!6=t+IKf6G5@tD*ht?|9PZ%b-W&UFnC$GWoGuk=+{`&(OIZ1MYQD(`KnYxAR~rltj4Woxsq+JAU$eN&aM{*bHD*WiD4>GQ6p zHEUY^ZS0V%weCKD^0!?ZeD&-7mFwynYrZ3zR#vTFv&P@DU8p?Q?5mMV$#Im9RBfs> zx{y7!ZEE&ES65s#P!pHQ(Ng1#)~AVD9HoAn_=4fZrpEeBmv!PoX`v&|V0~KXjHC2L zTX>tauprK0eOl;_qx1z^cwo*7X`(02XnmSEKaSEDZDQ#nX<|j3(fTxTWgMk1+QfyC zgJpG`(fTy8HjdIIO*H3&gRRx|Yo%kFMF&=S?9(jLR|nA(O50WHZ@fzV%>&Rs*ico! zPCDSUK7bygJ#f1HYCCxFYCFKHLQBCFeTj#vd~MaW5ifKMpo?B!*nPDf?7i9!_79+g z1It_tfIQhT@`)g_Vtk)9`cB1_8=-`b0zAMFm6MY~X8=2y;@GKJ9bv0=d>H!` zRuqAZ8MuQbIeM_Iuwb`^ZR@tO_HIie4{QnT>gi%{c3axronyjdjD5H+e4T;r>}*{h zGbZ?fO)-Xg1Zc%r)OKumtZ3tK==#{P!80R8n+_ng340fIl5@k}n>%$7?Ou21LbkK} z4D0N6vqyk?Ap5$*>>J(m7Z+wfG4XK|A2sotCO&20@bGYB+*RRK295}iFfb*YVqiu% z!@#U?mVs#h0A;k$-axcB5bX^_djrwlK=OyOwW4#$(zE)q3R}+;qqg_x><8T)<@t~1 zw=d{iP`aSgnm@VIHRV;kXtJ&6VbmAFA8zc+6hckY^{lL*&fe}Wn*5Glnm?g)9y{DU z!R`#`6OM09pXUrPruT%MPW7|!mZj;}I0G4M`#HB$VHs@4x%5d2OP}PdO~>|lgWG;C zeJQ@UJlEY@o%yvBTzl@H;L59~KGG*U*{De_w$+(u@Cn4BYypl+tOyV~**o3yCb&Dt zyN~NSd!yT(ulCH?I%E5cdGquHHshG?F5F(oj&^TjP6zrKzb!8{x83O+pFA{a%yUAFf}!orB+->R+IDvC>z^0v0bs$t-BSmA2-A1OxRASVEl~4=0Ij*OTdb4 zcWSCWW5kHyvWnxn`)2Z4XZNtM+xYtkvRS$AS+i=X>`2GnxpVYz&NyAKz~9igTm#!- zW6+tG`zUjc4?6Q5;rx8N2NPB1q`X?#KR)ctwVIH&DWSVZWT7_6-FBt5@==1jB>I^pj{W38WNb!mK*h&M_{%t&=C@t z%Wg61ogRVty1;^vz!J8?sCRn=&e8>XLIQi+7&QI(tf9j^0sp`svaGE8zcBEx|EzOm5Os26!T_1;e z5ZfcM9nRT|bp!pHbz#)WvaBcDXl^B4s+VP1ucKa%s+VP1kA(ukY?QuPmi0QydQ`nE z%X%z$xQ-X4Z`k$bQH(kS@pp$U(?q$Pvhkke47YLykgTg&c#t4ml3_9^?e18xn%N1vv?M z8*&Qr9^^FS1IQW3Imm~Qk02jIehm33B97|H%d=aJa^CnbwjDSIjB;{vF6mRO)s?5(5zj>G6DO&)BDTr1({>2Q zeIjD8#CvcpRaJ9?%r1_791lt0lZdz5(I3yH6&F(uSXx_q;HmGJ&QiZNk94+r}3kYand)-5&h_6 zob;oQang@I#!25SNA%5OTJ9e+%d%caxz!`q?aR)Qh%3;vgW?U~ZkAx+UXQ^2Ofkw` z9)Sm$#V8-~2z-g+1?U~6cma5fB^r3#Bk%-EGRh&3z>{p4Q9k7nc$y^}oqykJz^jJjd8$-RF~zFh=c0 zO%m-!O%m;{Ef(#rEk?U1L_20R$ILXxbedz%yn-=b>F1cQiFWh0sy%;9-rh4TXzkf- zZSUD~a#xRo{p^Pw3CieLpAz4h7|fP)HkEe|4`vhYNfEer1nOgaPxbpUME(9OfzONy zX3Md<#zH6N`yrhJ*Ne6XCki|~S>TaEw2%4zsQrsmMg2=N1in05j6rcN)gSdjFXnqD z{?&5P{@8+Gc3+J@#kBSpy{i%CI{d?<0z0{{UhV)Nw z67?V4kN%_dW&N40qW;`=Q7`pr4kG;z9}@K+J&gXN;tZ;%b6q-y=)2L_56?PEgI2Ci{isBpDhn(AFAF?d#X>OxF z$&U0%Rfav8Q>v_e#yF3C_LE_lyVw&3?*3fN^Q6J=WltNppXwuHVzOd57l3{j#p=L= z>;(gluj*g5 z-<@^XR{p+rxHd?J&SNlSh+C}-h!M7n}5dPRaiC)pLHM-M_S=*_UwHIKeklxW5H*byaV%J z#>8_*eNMTk&xz3+H&5{6qV&d^dX(?D+&Y{J+fyyBfYbBJ4>Ol3S{~u8ik45fO3_?| zcPd%|p-<6@2>psyLRhD0(+C?B%}scZqRk{+uV}LfH!0eUg!e0&hj6Q+%^}>bXk~;C zDcW4ZhZSu;;bV$+GvO18wutaaMO#Anw4zlIKC5WU2@fgS3c?o@?KZ;iDB4QGR}}4b z!q*h-4#GDS?JmOaE81$pHx;dl@CS-kL-?+uts#70(P|0LD%#zIKT@=M!t;vONcf4O zH52|!(OL*URkSw3e^Rs!guhm_jfDTIX!jBRPSG|K{t+iugnv~uoiM?o1ql-^T07xz zi?)L>#iBh(IKrasB+RgAy9l!^+9QNxELsQQSc}$4c)dm2O*qk_?IE0O(e@G+TC{zH zQ!Uzl!WkCr8N%5Xt&7lW(GC!nTeO3O3oP1U!o?Pij$J3_oeu8-JA1`Ac8;>w3_O+( zzSH}vy#Tn|`__DV%h1SGcY#;R=NT~U}k(8Fo5uA8M*ZFgQ4V=? zcu17@??U-4ZxPpTHtM@NP(JCM!$YEca5u_tdzW*4p;3QiFUqI9RXilhFYQP9J#P!& za;oo`sS9mSdxN~cvC~1c|G?YHWelY+GJSHF@JXoI^hHqk0?~I*0-O6cbLUxmW-Gnp z9et5u|Iv`ZV-EBkiFXj^Q#oF|qc1k}j)w%Epmu%5JNgnsClnHRl5{Q?@90a7_NPJu zPgDE;#5E;~lcatdAS-(D(>IX#BYG4vmlcD~fl>Cg?V=oDpX4NW7DP>v|9TIf!#^-H=VzVDEg7nTw~j%!qqpa%#lKi+_lxcqd-o z=Ti9|(xG@KUfw5OzK7aVyb~|)bGdvE=~28BFYiP7p4dDN<#zg$12WTdWwIjUh2!Zhw?vEPw`J|o`-nK|6Ec0L;j48f9TrSmBv5lqgVVx*UzHk z9~z^t_=m=gj(@06ieF-{X~l_$q>qevCpI1$ihueQ|6J<2%TW9?6#vBLd-5^QP<|&i z?^6IiUcP4*c*^g@=6yWi zmG61;uf6j<@?JYyqbS~qm;bq3yb~|)(^tF`FYj}?cqd-or$6ydyu44Zcn9a|wBG}Y zcj!DG%OL4rHr}E9f!S`r@ebuN`V;R^?!jyqH{Kze&HA|U4%uqf$BlQ$4#Xg|&vD}& z8sDtH;&_K_i;j25mMe{S$UZZ^`NH!(GTxzmr1$^Pa8LJlE!%tMZFTQ(s|5kJeRxP6)(@zpX*)m@;n2%-bMQ!?;02JS8`1&I)@XTtD)Qt z^%Z^1iptT~sHhx$O^V9V*Py5zea*?ZK7{*e=zbB#^l(Agy(5g}*sT-<#J!bHJ3Z5a zXFO`RIbF5m*}G@b3l$b}I&0A$_eim`Szfx2qq1?$(?j$yM5}PxTvBjlwC$Tvb znM?6tK8lw+iOreFT*{?#C$TvbnM+xX{s)>nk=KegqYvX=7$98}lGlRdHK1MK>0Xx( zAl>64ul2}lJo~}Zy)0co%3H|wIT`ckM+v>Jr^L%ATy9;Bmrv+xU5%GdxZJuLFQ3q# zbv0f-VQ5_)T32zu>d^hEIQECu)uDBDXkEoUrdSuh=yerOWA(7}cqV}3Wabkfo}D+& zAj$)q}^_oXumM(B~NZ_^9hkQOX9rwFy*L_9F0A>N{{`b+qQNU|~ zc-AdI_ur?FQSf#T;B|PP082b;--_qcfGI#;nisgH)D?i4XZfd3GMIVxnopl)^!@-| zo*lTR{3`+4)^Hzw#+C|e7I4&-l~%s2LglwCSNW|gRQ}c5RK9AZ%D;BI%I~~G<&}4- z+_zfg)m19@*Qk8$8kN`8s(jtuDsQM)c~hgx?`c+fYm3U)x2gQz4JzNXQRQF1Pv!S- zR{55I%D3t&-xgH)_I8y&utVh!J*e_;>{R)~yHx(@BPxHaL*9a75+beo^J$c}eBpeOcwN998*$c~#}F z9aH(=ysq*$j;s7{zo+u=pHO*Ex60oPsrHmH+(* zDnENh9@*jPu^8fac%FlnS@}K-z<)8di<^TO3RQ|IID*qopSNW&EQ2GD-N0tB6 zFIE1({#oU}{*}uA_rIw8Uw@%PoHk%GiKQM z4L8{M?AbPc(@i$+_1bu8sg0MH+xWbBHojnijW1ki&-JNN8zA+8YH`4ncm;G&6C3d9)prHwZgb`+OXF;u%V(J>8}n?~|Zh zzgtyn=^08IE9gwN)7S?wX8Q2ti*1te{3f2yywvj~Iy>Gy5_XJ)T~13uAm36HD6o_S zN-WcG%*{WeA-6!!nrJc4jARuHJ}c^(5iROjlhIQ}{pc9IYo`nT+9SVf*owQuww@7m<>C;u%mns?9fw$9W!Cajj&@5?3gx2cFdN?B1boX1wd zjyqw84|ezm!45f(-3~kMf*n<`W6dDgA?LBL!H!DUQ4KrR4uTzD`aCwmoW}^G=P@eN zJVqEbkJZ8-_3%eC{LwZD{*d!n9qed;9rwVF^@CuCoX75l9gVP~1$Jy01UuwBwhneQ z!H!ngaql45A?LA;uwygq&|ydWAlM=2u}!e!e%P@Uc5EL6JLEieAM6Ogjv(yVF$i|Z zdF<=3V+-uq20I=Y1UtU;c`VJG#|We6F)GtMMi@1ZJqUm7f^xL1Unvv z9go3|CkDX|Igjmx9go0{PS~+$5bTii*f(Iuqp;&~*zwIlutUycdtt|Z*wF<$4i17H zavpmUc03I`o`oHU2Eh(FkL`mU&%llYu;cI`*dgb!r(nmoV8?T?nODtYmz!q$Jlm)7?25Ila-8YYbi7mN~{CQ}J#^ zcc%wq%N(M(M=9AZ-Z|^ECnw-hE94Q)yrTp6DMj*%hMt*M?43V!B7c~N{9!)whc4U) z)qOq@FQ}PlaU!3WhkRZ>@_DX+!z%bZ@GkI0;HQE2Bny5L_$lCv!A}Rj+AerE_*vjR z;LE`4se+#Yel~b7_;T=F>4KjL{zmX~z|RFA(gc44_?y6&f}aQeLbl*V{^GOqB7b3E zzRnCR2nj5qHhQYTxkCbHu@Oeu;}K}Q=Tqh{v`GF!!Ml3#_vBV$@)ssQD~2yHc~1;q zV)A7d`T5wNMcAJT?9U49kM#Qj@QcAO1Ai-c>Gzw#F9E+C{B7W+-xq>k3jP-GUj;Ay zz7qT$;8%mM0WbZ&3jCemec=7zrQdG{e;4>F@N2+JzkdyUCHQLaYr%i%ey2FZ34hUX z9PM`_PBHmtzY|Y>r}oi)C!YLHe3aj7u|M_LpJwb&8}>)~y$*Z>_I zE#NnRmwsOdz6pFQ_ z@#J^nqx??!*d${YEzZ z`dccSeQmY$LmQQ?O_go6_#uv#=7!eFYF}ez>-y&Arj|Ces9xt=Ti3W2_5R8lf7SZ6 zm3574nkv^cwN(1*>nodDE1O%I*3{MeTc@&-7Q`R1bD{3?@Q z5!1fLSTa z$*0Ejf5POCV5W)wykqhXJ(^_SG$M$}5n3e5(E1@0o*wmdRK;j|Ljwy?-6hbSyA zFOX_e3#A&`mPmG5xmRvZFOfDBPWMieth=COAV2pq5FPr7n98zMOT3F0h$Bz_9Sjud z&kFYl(xZ{$BR5QxzjrIEt#ZRe(xtgus>%6W;(W6m&F3^0PA80yoab>54*PvL+$8no zd@eE9Y%lAjzDctzY*62_Ue5m#9cDvWFYT9j9qKJn`f|OIDAyaa8)F~jHsJ(_acG*P zo+dUVdQ4tIY52kf1f$>JIt~JY8>^U-#sE-+6Vf``9c0}atas`nVVngbGH-?w`5^1`Z-%WJWm2IN5 w(yUBDk=8P4zx?~T3!JoFXo&xhZV)K0!Q${T z1VoB}5h)_th=6FVMHel)G#OoL(V_+Fy4u!V+Ff_u-FDaQZoB<-_J7`eH%!KXgrJ-5 zZ!`Iw{NMeb`*GfxdH0@o-kUd@OXkf`Vq)A@CSDeHk;RzX1a5Qt^FqSwMOTd55 z5?LH*YU>v2&tcH)Ok#prk7zljNp*o>iYQ8C$)Zf{U;<-rQvenTj z^o~cmV!bG`4-(A{b36sbp0tzg<^C=HxT%u%uczZlpD#i?8t-afG?w8{GMHWwc*j6N zOO(Ek2VDZ49q5@qeP-!W#sV*RYbvW6z0S41hFV{B#gxUweT5d-KtfM zz9x3W*|_FDU&1$?>%GJn+9JIZL?nm8*;qc7UTY4fCsrBO!f*2Lve8hz0w&h0!{ zRz?}ETNA6IG`g&b=6rCtv7&mlbWFYIz$%Y@nnk+nAbdh;y2kb!uCe{5UbH`4Q(nDR zI^e9{iyp!~aJJ=IJJ^1$9bi?VrQoXG#3SY2ri!XgFYM_>7hSxt_gXvHf2|!H>_rEM z7BzOdpsg3}ySU)+wRUjqT03~Y7abg$ddUUHd(pm&3r<{X2Pdz!gHyfe;Lx;7E(rFb zeHRy;zSa)TTx$nsucm{@Ag^Z;)S*S4tN+<^Vz{ddM9+Q@rPXB@o?aSx>m?&EMj5W# z)%ENrQCjub$R0-YhZdK1PC=hW8LxXQe;cJ$kFD(C`#rrgPI)JWMH#PKE8mGph|=r} zH?uRce zVcZeK9bi_^r0 z!h)J=Z<#m0=-&LIvU{rVuVzBQhWxVH{386XsPk>eE35G~uFWs2Z!GysXMMw(_1-35 zrzw|gWha~szI$-?*=5K_@J`3XeAC&uy1oIut*dQX`HZu1V@-Ko^@qj1Gs3yH5`kuC zfA1*l+s=Ev>wT*l>fk)`fm@z2ulF@HuBoeSm8bLKq&v!gs*qyA*|*GJm~~vhLC84h zX8yx%@>E~SP~vo)!S4}V>*<^SV*CxWZ?=FYp%>MaDFB0@f4$rNUN(c*ItRYOhThdm z-ORg0YjF$6TP%sJ1v@G8EYa#+4HpxN*%k_+MSNWFB47to>^l^zJ!G|x3Spd(q6lQn zz-=tf-hpj}1==laOS_e|v|D1ie{*nW$FR^aqc5Hi7erTV7y2YSNvE{9T1SLN7uHAFl zj`nk`6|xJm4|1SA#J<)}pUx2biHVPxxZA|9oA|ha1407~zpMzYFmO<4kby~|Bm>hz zX$EG5G7Loh2Phkc`UaxDfv9gF>Klmq2GV{sv%hNB*}3+G3(o6{niuWT+4tJ_l;rNt zZJFIVd)n+)Ywoz#@y?g@f^oKvjd))Q@P4|mFHS;D)AfvufX?1*FBtcIeOm75)|u>R z`{+c6Umtz4Id!JP4{vsa91iuf(B_4y10DV}w)KL`p|CWz?Lz8Ug{6*lRHb5jl)-Ji zkh%~bvK;5#&5qov(awGMk9Ou%Qy;109Bjx~Cu?@(7<>$IC|iI-VoUvm4)*=_nWJ5; zqg*F-oxR%b%2hjxn~S#=&zz~pu;LTCD}QS~JKny9Iqc|X)Rvs&>=uV(RKl3#ssvVe zJ^}f_ammS5f#l?DmRx!)WPLtY?Kp%0uxz`QBHOXOw|QyL4lb}G4yt~z-(QhBPcMJSqy!iTVSCs z&=nLog;@=~+bwW9%^~#zO&gnqje4n3hXn;%*&?Hy;1-yu3rr3QOl6A=z2+8}sSC6R z1!l8b4ZXuHFjp6t7Zg~?mKb`MTi_I3pgSnAtBnEEk53tGm|LJ#7nl$fn8*$p`ee7j zR9&DJ6qw1LHuQG4z-(QhBPcMJ9X9lNZh?imKvz)U6n4bWyWIl2+PLM<|Gf^e6YYv6 zEq038h%Fs|H&}Rnc5Y$7BkN)%B5> z2eCaE+o7yYSUb?KsS80T%TiCa(cDUHNiWM%uS2hg>1A2!u}~lg4r_0grCvu_57W!C z)EmJr=JL+*5w9_nrCvu_57W!C)HB2?3~NAVd$TO{I+erLKV2!sd$ky{1hNdW9CAD4 z4#-`Qm5_2sC1e$(3UW838d3|XhcrN%AnPF;AooEwLHrOM5`eTown4T-c0hJQc0u+) zS|NKO`yl%v2OtL_PeR%thaiU`M{79 z@+RaA+@A0eMX*k)Z(AXZ2mBmtsA5+NLt3`v2cLee1`WGEyPay`Tj$%14qgn%;VY?c3fv0$2DR%GhXXF#%~Ba z#7^n7TS?XE%F;4Bnk8v75{1~V9X1TMy zc}z?DKdD8(bk*e}ngk9nUxXW#|)EO2Q4`|O(rUSRCu_7CHZF-G-< zj1~2Uj1~2+n;`04Hv#n?74?|a95dY<(_xM|`6|YIv4>;6Eb7f{Ry%%|u(cyTVC~ps zZRyy2dS{27{p?43Vw9ngJ|(_2HjpXjY%1>^5XdClmn3liAm}4}Px=FCLVqwr;FH4w znR2YQ5oi6!V;l$OT2K6`6=IAleIBS@`<;Qz z$k<@>>*Bo$deOJ}{!!gGe4?%#i{=pGPp=XC^=1v~hs6m}f94*ce|w$K%l0(KQ2VnR zh5p_9(SKNbsXx~&^cS`Yy=+f&5Ve2rL7{*DA@m;>XON!Gcj*|S_eNtsI&YsAuo7m? zEVr`rLqD)+m?+Uk_9^ErW1gW{34Ivt&sTqd%~ZF*G+khNP~Z@XZ)6{GZj*h;veeVu zMtzbUsbk9xd(y|3Tf2>M9{%hn@surm)WE%;i+LV1`2Fk&0}qnEb4*ND4Cex9-$t=I z@GyJUz+>!N20kBB3XGU`6npzzMux()3`ct67`ru@vGd6>kFivX=8tvcsIhjtn(VRr z?d%8V?Y1Q?BY-1-S->pdNZ?4I-Q)I;@J#h*d8YeEdS>{Ohh?gEORj&!xLhR*G7^$L zEK><7laW^mA*P-}xH+WU><0|q~0n&3x(PcwNt z=D)Nt7Yuz?iO^?7v>Q27@FT<8jWF9$zT-;ka58L9wmAI`_lrMDU!-U`gi94ImvDum zISKDnv^+wuq7@MO6s?eOjiMD1)+m~b@E%2*Ot?SdTqlz|<@G(VOK=_2Bl@dOsXp0GtDB2RjXBBN3;kOiRIpK?n zc01wAigpL#tBQ6P;dd2nCE@FeR!;amMXMxyOVL&lzN2VWgy$9QZo~POJ$3q-Z)}j719&##*!% z!T}a-8)1?~+fF#hqU|6|vuHaBGc4LJ!eJI|58()l)=GGTMcYd_#-i;b9B0w?6XsjA z1B4ST+Cjo%i}obpO%|<<&|}dK5tdlA!-TUf+EK#!7LATw2j-nUo@B0-8TO8WXYT<2 zoTnjaCGdGqAgL1gf~Pf!*2F4oANL$ess_H~c`2zLc*65WQiEH}mrW?Y;<=Ed&l2%M z+wCG=*Z}^d=jTbmY7sYF^ozJ*6ZqFWze0T@#@VBze9EKnpeXNcLAl+N!u8pPe*boq zgPtrN6y<|EQGUZy!1XzXzHJZ6r#(}7P?QhvMfpw7Vy@3M^vCw2e8yAGgQEQWL6qP2 zH1N%5x{sOKQ1`4Sz-1nW&-1cWaTeRssL8Q+;P)7o*9 z(ly@E=Na}N4+=bCN8g?C4&r<&M~ip#`9`~wL4l{JUU%`1zQAY`3<^9=ZLSpW=nIYd zXMzIHQvII9JCq~C5_67nWWe{R-8IEKIESDpu|M9ydfy-K{J)EL zB6Bh_-igf3$asgDIVGLeA;dc}wv|9TA%Jsk-H?seK-YYanTw~j%!qp;a%#j!i+_lxcqdxk=W_WT zYD4i(w7gHWd=J&9cqdxk=Suk=YDe)-w7d`Hdm{5Vl-ub^4#ebIDgGh7>_^5ml>Z?;#Xr&V zJe2<-J;gtym+fWTL-`-lQ~VQ|=OLc*KUWq1kUzuYpFg*JPvkYNDDjZ=krD4i#zXz_ zPmkiC%UyTrkAM2(pU8YqF6J4^??mQ(^1w&S_e=p#`JKqTj~jfne9t`al;4TW`;>x@ zmhZU(Jmq&H^FAxV_vd>qT_bCQKmND!J+J?%YraR`Z%1ns#XHgRKUa!(qUC+Mi+7^s zeXbPmM9cg1B;JXZ_vsSv;9Q;dyI1iJoyX%ep4wkA-l6<~S+Cde4&@wr67Nv%!K@cG z-XWV!ebjh|Y&G>!;~laCF$wK+)Od%+H}zK??~rZb@ebL7@}Etwqs_?( z(XMe3e>K;%!gDy`xf;sdP+#HKtf(A*jf%?Q*QBT%ehrGs;n$pucmem*(ETFF@rCk2 zu5BSK$1Ww$FYc{$B+~a<@Ewn;Ee>bZDE8L5)O>{n9gZs0$30T)e1?ba<9O>_8u2+f z20w!MX@XBDJ~!9kvxqMd{1D=uPJ{&lr~UBx}7SQo$Obrs*n z>R=z@I{_RgGamu*{dwaY;zRS^e}QSb!1SQNA?zbVuek+g=mLiZ1ztye$nS@y;(m8+ z`1MK}Fat>UzYhft0bU2hcisGS|9$E(g*ga^&Z(XeNrAt(P+cK4} zSg!J~+^+IF?@)Q!T`KpkRCz_Y%6*k8U%g7@YpPVf_HLEeRI9wMR^|89tGuy6p-K+Ai?^F3>`&It<0hK>-Q03ovQsqyzsr>0fDnD{q<|r9{;lU# z{_Pi3{^D_!|HVrxfBA&Uzw?U9Up=YvzkE&Q-#w-Bj&_y59#r{Xy`l2&omTnx-&FZq zXH@=!w^jbmS(X3rU6r3dr}7_NQ2AfKr}DpfU*#Wupz@#mSmhu6RONs7w<`bHMV0^k z&sF~MKdAhl{!!)s{E5o{^V{J(#r^56Yd<$w6S z%K!97m4`l4c}$FrTP!vn8*Ah7@isnSfQ{R1HlCDZ;{ylU_@F^HK6tQ=r={8WkRdjn zkzwQ4U1#INhS~V=;Wj>EgpH3JY2!ECVB@1l+xVCQ&jgK2=HvjnAHK<8$ZQ`26`czHp)K zHr?9sM|}4x#D37ubjIc`z%v|{FW2=IE5tJ$=F&4Ae%$^5z8Bfz$j+wcO3?E^@Vtve z%E!-*FK#WyIef~P?5Y&YD1SW@_lX5>sue@mvVX z*}GJ=ioOd;eFq!~i8R(>^q)E);ZmIhd=C@fv%K8*LUeYreK5X1IT+ucbXfBIxt0Qd zo~6)VXeq+6GwYm&907f2M2q=+L_CcutKePBm&tnh~m zc1(dCZrCxs5A0wAgdN4O<0jbQfgL4%V27S0?3fHYZiXFGVaJRX{!|dcF1{b4(wO}I~K!^Wqn|WoX6(Ej)kz}R@iY{AJ`%1vE{Ji4%o30c2xF( z9daI90Xy!59bVYs>jOLFJa#+mxC?fa!;V#bV27N?z5+YSU`GY)SltJ9eChMpAafog z44=oSO!F9F*gRGRe^kRC_3%eiANWJgV{2eX4eYoFcC70IJLEieH|(f|9SyK!eIM8% z=drc0qYid7!j60Uzz#W&ZGat{V22JnTKd2aIgf3G9rweIX4tW{5A2Zh*nP0W4?6;| zV_P5CA?LBL!j8?bV+-tfpbzZ$(&w=ha~>lMpU0?7^B7^+JhmPF*a?5^fj{>4fj{It z_8{zd2zERSJ09%=JLEjJ19t3!9j&lqUmw^Z=drKBj@_{15!mtdKCna1WBXypLDnN<`RPhT}nhg!sMq!@OdWhj^GPTe)=VT7WQWz_NNs4vjqDi{XQG~ zeDI6FF9k3CJ_q~)@QcAO126qP7yLr-w}QV7y!88W@OOY;3BD4%^!p0%cY^nV_kovw zza9Kt;LE|U0x$jk74T)?E5NS?|E2q#;tB`+MaN~h-;KD#G!+A*Me^VzaG5w`&#gI;2Xi;3tswt1Ncqg zb?`0VrQbJ#zaM-v_^sfj-|qwO2Oj{x4ZQUGSHW)vzXkjQ;Jvf27|Z1pg5DhrvGzUiy6p_+8*z!S4ev z{r)xZyTLyK{_EhS-}i$*2)+&cVer!LkAZ&z{8QkMfR}zh0RBnvhrk~NFa7>F_-}xJ z8vHZhzjVK+!%yV*A;56I6Hk68KHTrbli!IC_dD_AcjCkRPR|dF$9$$?zgTkp#i5!- z4`EHi^2Urc25VFs$}w!|ca|=*c4U3B75QDt@is%~xtrp(a*#lqS;i$Cx+}^_)SLpym`~@1$j>V*&XAn>uOi~ z8p`UuO;z;68fA@jWldH1;f;p+n#QsUZ*5uQy88OMh9=agSnFNArgk;-zOqVR`MT9* zYid{3m9464DDzfVm(?|v)i>0wT2t+79N*YbF}}Kbea$#9cl#=ujDkI{veG-jTe)gt z(WIgxU!K=nkyn{FA%D_@3SZ@fNfV2G`S`)cit>svoo0V-1MbBuo@H+7suDl58P5*m zX58@C5Tg*!=0)_0dwKDSXP-v!<4k^Ujwiv7MB$G0o(6F6+uKF!{FWUHKIz zf4rb8Uup7tBkCJ(GVE-L=>I;m{w!bDezuzYr+0Sc_n3U zs$Vqu(tEqsKV|Y-ME~D6c?45kUKdPW$BY&J`pD#eyE&X?>=To(Z|=(fr^$a3=*rj8 z2SrI>@^@LK_A_im`<;}Ds2}ZjQkb*0uF1!ol^bjEqc6<4x^|tj%G+4Q%>TqUH85vG zU8T3l%bZQV4NXREZ_Szt(7x)1IvhHcl{j#iv!bqsejta@56Com%YjvBEw%)*YHzu( zx-8$7H_6=0%PT0BoB72PIya|I>)b4vC^rl8O62CWlFquWB9~+*lyo+86?Hb7R+4X4 z$j>V-kabHshsZA~l;-3YOwE&ZCzQy#`4jUCC0kTjBsX1o(uVw^lFnw6CP*XmClyO` z@+Zyk^zxTsdfAMAP^N79iUpqev&B&?KO?A9gc*zYDfBt}dl#y*%`8Za9FjYHNX^%Suo(QWb)BFp2Up!RZI?ff02!J zN`amKsdCtBBDSe5?E}5!ydi&Yxn9tQk7!f+56>huUN@VZY`@?9J?Q;7*Q8gKSfE4! z|09NH^GzOC{}Jf5$W3#8yI&#vAB|#xnd#2Gb`B zSC103MCtnxpl?E*1L$2gZ(-@Jj0Iox)m2tE`&=9RP4)iT=12D)b@`gsH~)m??|RYY zt7zHit1a_YRQQ{lpKtQ}D$DL|s%i0~rn0gLTv>CYufl(9eQiUzulA^`-dE>;dd0J@ zhN`M&e+xV6YOcA@pZG1;CSUDFf7ynb`pR#Mre)JB7+KjlN2$lo&(lc=?tx zqYK$n%a%s}Gd0r-2W#R=Ia+dz(fTw|i=otS6Q47@*ic`)<%&*RA}w^p7_3hVoiUU? zZwqge7UstotWOKwF_b=M3lGm-Elu>q7_Cne7sXKeyiF`wDorenFo6zIDNGpoVnT#&JLu5=pe6u5!B(8J*)rOb7HuG1)^s^jG=Yeg=dgP-g3#vi!p}l zb9FuY%NSbyHnN`){oz%mJyX!fF~;lL%1>fw_1nsRe&6$Y;*@veSd8)dwDR4!#29Ly zyO}+a{kw6=F~;kMnOY3B%bNL`YkgD0#>TQ{|N1(AeM?zY&1QdP87;=|F-2#u8Yg}@ zMi~E+Vd3|FOy4qOZvhG<|^Om~uhT03ogRsE0p%NixPiOBc z>^rV|eVhDMO$~4x?Y&2yD{t~QHPb))2&Adyj z4sHT@lf}+jv9mJI606-nxVUiKu5cI{Nfd_9bjj z=Q-Bi>1GcD_d^bLhS}FT>8~x!er)1nCO%@~*G>F}fg{5sjd9n8*BUrFJlepdaFT&( z;WPs?!Wjml{R5QILVE+z-axcB5bX^_djrWI%J$OsKaAbkrL*sMw$0Cb zB(HUG`{I(t?bf`h?XGDr>xENoUB5wnA^hRSzDy(3G+ocg2P|#XZ)$RKbuc+OnQXtFJ3Bjz{kc=0~U z&PJP)VzUdohje>B^S-Oq1hZ z$@er^J4fiO>pIqo{r>dFgz0~jr3kkHcl}0(y zBhao3Ob!W5Wvh&O%_A^V7w8BH%x1S3^-hn#JY8UZNMI3LZPdFx0_W%gJt2X;zK;<> z^Ka&9Lx*_;T6KYmA%S*w*r-qT2u#%lY9WD{>=~op;Srdv3v`AA=CLD2eZEIvkuK03 z5;%t)HR?ScfrgEZqr@>hV#oi#f7jB+Fgw|)SQ18r*(;qa&Ke(PyE?Cb{?ttgN^Zb| zH7^0{c-s8jKv>ZN;qbO_SQ%%at~dg^k_E|xd?_$bNeQIJtAVumK;RK(%?Vnq+2IlH zb_&4VR%>;TC0r^eBvgOXSmTuJ5#g+M2i77zORR?!wTH8?9%8N!PrS}Rvh|NC*~w~Y zT=*bZlHMLd9h&AKq^WLegeZM0^R zF4fDjtk+SmN7T!*tVaNVlavU3vn=a%l=X;uS(fz(WN>{pLff@JhTILQh15eDAx)4L z$R@~U$bFElkN`x71R<@EU69?7J&?VShaqi{cE~=+e#imHLC7JRa$a|1;kn@oDAs;|Kg!~Bd z6UaY7E<%0|`32;kA-{zDE96&@e}jAs`7Pw%A-{)w0{J84&yY_cY@4nq5Gy1Bk_b^D zb_j+{Yufj=u-j$W=8v$n4_S$MKK=K7n|< z9sTiaI<9Tzaczq@t||VBjKO=3@#`Xv@$5^-v~diePRE}-9rMO9fO-~rjFYZemi2n% zF;4nsS=O`2W1RFOk8v75@)#$5v)rQ}d5n{OEO8F;`W@DNjsa)(FY5oR&U$2|gHpx6j{Cnz=oo@DU`p7IEM zoh2CMkVoKYHo_>M@d!N25{>dXkHGgC`_+dy6-jlZ7?|$KHc(qvY~VPGy^gbAcc)MO zfIV;EdG;J|T)_wITLzwI?9t8(3C9_uc4McAc4McAcGpZ7?XH=Qc8`m8%xaFAZjR|R z$DBQYF<+ID(xmxKN6hi!4nxM-ge-yR>#lyf$f z_l*o@67EkDcwjW@qkK>G2h&9Tp$vgfjSptZu{tI|C+hnlox|6OwnrulJT_I}@dC7u z`u?c>^D{*K3$p~ic!L;&&R3}Zgco{I-!t(q&ll}aE)Hh)^_^1tSC^rmh&7$KQ!Bxl zzNhkcZxwBfN{a|rRL zYs7xNRfqNwae}NrbC0NhXQQZ>`ZULo{@E>}{=NIre}ulQKet`fpWi9!r9RC;r2qbd zqW*)2(0@dnLG_fwpks)>8;$+2+ffp<5@szdx3cbWA6hg_l+clV%6Uu15>z(yyK6th zW~xVEnl3OsBycRnH?j{ox5++aS=Q6sMtzbUsZ+`gd(x+sTlXzBZ4u$4igq*MI*;t(@@tidISZ zwxU%LzN=`}gx!jEH{m}jS}oxPMXM+LsiHL!{!G!D2tQJ^7Q%l~v`vJ+RaHK`sMVMsKb`y@aXnP3LEZSbe42$+K;dqPI zMmWKuwG&=v(e@EewrKkar&_cFgasDuAmI#)c8GA6MSF_y28-4~=(T8v3Flk1BZP}B z+A+dq7LATwC+3|t?_oQ8$vAdSuvZK`nFqez`?5VBxX=5hy$HD9d*1E_9`OF$&R#Rx zonqfJ@bx+14|;!V_W%!hm85yVr@Sdii+~;8tR(h^(JsWkZ{X>r;17EXlS+X{ymOOQ z1CM!EC9MG-_m(HIw~cmZ*t-Uvy#xI7-ln8=z!$v1q)On6-u5I~6RWX(!h1NW7WlIF z<)lX7N$;CUO&&2{wxIl~_k5DRNW=>rw~KgTGx$^9pC^TCMci;PAmWCt;J@qrE!rC~ zPMePM>t2P2M0sB;%AMX6t}iy~59~%cBT))|+pzoe|2f21C$BK9KWrp6V zkigfeU0?BzzTD6Wg#?}^oh!vV`U<1{nUKJ<)V@FQ4&}(O#GIoX8Ss75yQ+8x=K$2l z72_TAyocHiD&C><3@nTNig(O&C>kqfyhG_KDn74OivXhZRj)L|5_TrS?B zeWm)##XB@6)epxz=lT=xTrQtD9PePgAC7na=Hi{`oQ#ZjqH{Ac-eG1=NvCxP@s7+b z$#_TRm}I;ob4@bdkvS(B@5tPfjCW)XO2#`f7xkYR@AQ>3!tC7>@5JF6<^z8U;+$JI zWQ#S}JKtmG;%O~2;-09S8u78>AL1$AiIw-cT)u~NDBg*c_lcG7q4pH-#LD|zDc?hS z6z{~!`%u0oI*&uSo&My2jQB_9aU=3OM&2hP4@B`#bRLInqWn%@dBMo|hx(8?17^k} zl=q3u15x}FE6+pu9~zV5AF7xA$hd~`KU7cgPpmu-<$tK2;vcG)`ZDgJ{14Sr{1ct$ zA)fL-1B!phpONtoT^k!{{DVGv#XofYEHeI~G5U&sXxzy7hx(-WCHk6HjCe@;$cT5M zp7J}XvGP9siFabFat|tUS+^u6M=C^YrI> zSFAkGV6J!3zQ?-8Mf^anX+`F6B6BsAyP>`!uUSz!@){MDBdmGJCN^$w@2I)}Y|F10{mA*Zt%?QxG3>(21f zeH?F}OCvrv*Wf1*Un2N);`8zhK8yJIf*(t~%VqEriC=h-UGBb~{#?hRHqrOtd?xM~ zjxz=vXAI{~hI1#;xs&LeiHt2Smph5hnaEs<2lG*^+(~rKMCMX1mph5hnaEtqI`lu- z+=;wav=x0A_rd__nvlE}B(DMO1yA?7v;pZJ7kRBmUgJ3gp6+Go08-vUuFr{>KR<}; zeLW>sKH*C1YOH)hU+ZeDe8QF1)mZt2{;aF9@(IK1>hQXX`&EbUPsOo6ysi$ftHbLm z?lHx>_<66Zcp9sVUBEK|949kB1>&6u#yP|V^WJ}fX}ZAlkifC*r$)Wz5tyM192XLJ z4fP?P4^74W?%K6qQqq7KK)U~Z9B?e~8X%r^3()=dspA#qB)k^yiD2>F_U(8+4VVPv zCAoo7C9VL>?B*Xm!C+?hsE?jz^j-))KQl0D{+9x@t>HfWv~8u<4B*&pYpi@_smgCz zrSe->tNhEiseJ7km4D@SmEU=X%FFIjxo@4yE6P>wuT=T^DwWq%t9--VDzB?mc|*O* z?`c$dbCb$9wy6BxO)B5AS><27Pv!S-Rr$7n%D3w(-w{;#&Q_H_uuJ6+?pFEN_Ne@! zy()j?VU<7Hrt-(yRsQ%sm4AJ|%AYu(@^2he`ICoK{>`UU{&a`RpE<1ZqeoQ!>@k%; zcUtCt-|NNWEfAg`*|L<>A{=0uy z`TzU9%Kz|*%K!97mH*|>Di43E^0+t~w^(dEKHkPhjIi;MBW>Jfv+<-P8y_{w#z&8~ z@iAj;JT1+}$Bwn}j0_vU<{BFxKiuh||BpaVR*~X_#vGJ)> zZG74^8!srZ@#)iTe8vnLpE=XUXU($l>#w)*8*Z@in{Kjkuh+&)N^E@od>dc5(8d=p zw(%uPY<$@=8(*=)Rvib~e3J1n)VqQ_enRa(1t+)};4>ng8POu1 zH5oTU)Q^kOyJn`~uZhshF!k^@rVrya#wNp#sj#B}cFY(8J9MkCBNukKU`HYBC>{bk z+Dv~;fgRIe$8^{+a|rB6ju-xL!;U$y!vj0!4S^kOq_ATa?6?7TcwxuZ`t{6Wvp&~*b$O-_%%JRQ%~qMREN=rY=j zG`M_^z#=`#C>MqV7E^36#*!Ui$y+a`IMa2VFh5-nXJ=RAA~4=tfO4M67hU2P!4FH} zhf?@q^$_?$&R>gR$1>Ql5_a4=1a`>z>t@)o9CoaN9k&gE9diC!0y|c~j$2^ImxsU( zIe)Ez9e2Qvb+Dsy2<(va*IL+dC+zUS4*w9?A?L5#VaHvtqa1cr4S^kU{`v~+D1#jp zuw(rY*ztwUU!%?Wi!gHjqB6~2gc0*sHT+NuKQzJ*Ekoc3Ie*o_jyl+J5A4`D1a`>z z>u%Ul4?CJ*$EG2$L(X3tU`GS&Xoem44uKtV{@M&Xw!#h_cC-$G9diEK0z2-99ou2Y z&LOZv&R_SzjsWZk!j4@-V27N)z6v|G!HylU z_+c;n&;~#38v;Mb`RhU0@eu5I6m~p51a`>zYY*&r7s9go3| zuMdG8a{f91I}X8)4%l&I2<(va*AuYgN!al;>^M3EcF6hbAnbSwb{vKs$A-WTIe&cv zc6<|dJOewP9RfSPu=#6@Ie!sG&RcPF~rJs4Z&2*o`}iFWa>V5dDX4v$hHPiW>99k>svCvRxz znR&zB`9UZ0gSp5L<{>}m!hKGi7vk{(n#mR?@^!h$*X1E!=L$Hig3kr-0$&Ke7`!J@ z@KeA~13w-7Oz`XMf_HF>qhmw{gi{#NkP-#3F_4t^E* z+rUeIF9E*-{4L2D(*G5JV;6Hor8_L2T3p8QRGgukn?AGO$zM(jrm_Cxx+ z27Dd(d%$l5Fa3Qt_F@i%2fzox?*cFV{Z;VW!0!P60QfJ=->L8q`8y35>2KomOg_@z#FM|NeWbsMCw~(k z;qTqpkGN#FM{?kMuY3nDwez#tZq4&+k4BH_%3hS&wRW&q~`D$y+8k)-*n;NQWYW>aAnwu)7)z)sRn+oP` ze?^N?aO783`lkCTt7a6>EH3uv`+ODomHE>PW=^m0S5BWfW0t=FKd4zzUNO1H?9Xh# zmH3Qcq1N72c9Z`kN_VWu@1D}TJ?`nnX9Ua3>&;I!`Eyac+vKy?^lo2b^6mw_`Q;|R zsHiu;*5vb|+Eo*imkc{ER`u@ZKC}JqvflholRtZZZ~jkIR1(>leZ6^lt$nQD zcR39{*6+J8SA9c^pSdcx)Z=$*m}`CgMpw13xtf{(6W`LrTulv?z7`*IwfHx;7_EJE zH5H)!wM`B9Q>v`Q9};s_G}O@#`Y`&hq zdTdbE%Xw9zoL9|mjD3*Xgp(k&Us7LBAr?;)nCLM|m;Q?`PeT>y%Xz=&_ms@vh3@(N zXW0ccj`+~;Es-wm1AXLiEPt=LQP75uXjA$R@6I$nH=3N(cRwz8_eOzdRX()LJw{a#U$`!DO{ UxUyW0jad30oBGx$g{UI?4-@i#g#Z8m literal 0 HcmV?d00001 diff --git a/hsa/gfx950/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx950/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co new file mode 100755 index 0000000000000000000000000000000000000000..bd1eaa493c5ab31b285ae35c808b01f01977ac3f GIT binary patch literal 31904 zcmeHw4SZD9nfJ+^$p?2PhY*r*gCIkQFoZB9A;bYuXZS|G0s?|n!z390!QyZk z5>$#1BT_`P5dqO!i@s>lrOD``MT-`w>q=YqrG3|KcemZ`cH3@0o&7)e+!H3_Ktj+> z_iZM>C;xl?&-r-nGjq>*?sM;***tUpEF~_^V`buHVHa4OxlQ0Pw;dbB#-e!Q+$;(I zIkU3_(A3r=)StnS*_p&7vmViMOq1#Yp;S?n$Wla^+Q9_I28%Kq)fJEUL|I~FJF@ji zqp9EB73)Nit+1IJ=6DKk(Ap(!t~2UpTi=PUCcUQ<Z>~Bs_|9(pI-E=t9JG3 z27esGJ!*EfqM&(!(KWuv4RjgFLVEHS!} zJvDBu^FLELv7ommE|#OE#2Bqx6SWwPdTiozh8JsVsy1HMiSwj|ju?Y=YoRknqtDyI z<*2Fn68hzd-PM<5jlbaFy*h^`ia3>e8yU z(gCOSUi1*@fzwS_+rfiZ+W}S;S_-b{O*~ZUYb>ki^g>H7y6EDC-B;Ve-mC3ke=j;X zu(+Yq1#P`(-^B$7ueO6DSKGl0z3AY;w2Lk{+KcvGTyX4aJ2-x|9h~Sz2M4BKbU~;W z?Yp?(S{YUeI*@42YEeVd<5yM?sAbR?v7_Bb3@buEiTQ3@UA;xgs zuCAwl7Nb>amqQe7|RO#wqW`u^8iZYvp@!Nimvz z?q+sI_V2}|#2Bv!W@<5-UDC|gTx;rU*VUCY_}5hXYZ^;dS8ni^m(XJTK2vn|s&V3n zGrrCWzN*SKHP%-?!&8&@;vC0a)xHfStE+sCCHL0WuQfY2bnrTz0G=-@xqaazA5IhN z3k#~Nd?mj8qI>g;O75w^zv`mG4f!QC`9=6&R_ou8S5ob3Sesu`*D&*MTy^!8>wS&> zPE#)0%8t3}{rBMPv&)c=;+>9*`?{-PORDIA#_H1As`Hb3XM}5QIReej z{@zpAH(mGo*85l2*TQ+^1CKmoUhl7OsI0ANm8bLKq&vocs*q~I*|*GJn04I1LC84h zX8yxt@>E~SP?PhpR)hQP*1zAo?x!ETWdTn@FM0YeN}?_6-R}3Y8N4P3zZ~jaE!EAu zOROe0gS^>dXHD2inP-Vr?@GA1aNM?V7%dVLLKgtrnc~>4SRG-jb#xfxgcU^~V+L+x z362hID=gS_m_V8!VF zdq6wd@3JW&a^LrWX;dDlgft*i5<&#bl3?hWH6i4na(qH4k1!!2R7{wngz5kupbpf5 zZ9q0i2^|G^fFUYpB!qqmAUi|sjrOp2UMsyJVec)iI(DSV>u#OLwzr>Qt&p9NJ&=9v zVfK}F`ecUL4^4d3#9bzS)x;+Z91tF0_+w>wrGbOOgA7a#CmWa^PB$iCaSBTt=d4J>_Gp7^K9{x#AFdqN-CLZw z6=Pg`?jPgIsiHp8#yi=NaW1ySnPc#A#Gz~f4v8-g5IWg++Gme(w~lrn*LC(9nC{MR&Syv4w=$;#{fyq4lak%!bdF9Mn^KX)3eP4X4>&$0r6QP; zlFd?zFC?i6`dpNaZJXFGUg*}{ir9}^U~>j+CsZ(gdVF0VBfdUh#kM;oMV~xqP;hba zaov3j`K+})G3+)zA3-)X+dXw^1(hB6pz7`{J)AX4*YP0?kIFW%2{s0uIoZ3Ib9B&| z>j>xO+C7*6Gsfjq!2Z!;XRbX7iC`)RbEafxu|KuzQ{Aw2@r7Z`sAROKt4l6?q z)D=fSSF#|(AlC+FD5-(8cr}n79|-JX*3m($H9MS$I1cNV+iI-{vV@D}goKK(8Ec%9 zofyt)bzm*hv&4GH24H>4!g`3gK0N9=1IgAuq-Lk6#c|<3q>fHeSHy)=E!h==Eu$;w zxTLu@mBs7&D6DVT9*pg9)+WRQ=-1SRp_65)C);Svq_(7&WvSPp*CX_@EcFNga6%K& z-YiSKjY+dJEvWvSPx91+v#N-^H6 zC6J|%<&YJS+aY&A?t-j>ltRiOt05JTyCGGO8b}?a9?}R|57_{@53&gofas7QqzSSO z@*re8WCvs?qy^Fn*$vqP*$deR*$;UN(gryIIS4rnIRbee@&e>V$Wh44kYkWnA;%%F zLry^2AtA_{kdu(NA*UekLQX^8hn#_&gM0w_5b_b^2aq2@{sD3U@)O8ELVgPQ8RTCe zKZpDqxu%gLJ}ZJ5EWvFa7YRy6_N(YfM}4RkYSK( zAr43uBpWgs;)LWtav?5A9;5(L2q}WNAyXhzAvZxhkZF(^kXev9kXs=0AqzKCeA(p8 zu~U4x)tNh*;>!o|8Oa!zR1tKF__!@6$8I&sIithacHkH=%1KGt)ShDKwj9-tIJHwB zKTfR>F>8*UwnI4X;}KiO--GMws+t{Sc5&?Ect`-BK)l_K{&+SW*EaLGwnZG*6#qoV z;GM_#brHvS_Qhk`I0m58@sX!v-Z%!JXOYJ^wKdC9uSXu^)ZQ#hJ&QcXseR-zPUA-& zUHQFJYv1M>>P<03RPPuwgT>EaR%=7 z2;9#Uquk~Zc#v6)@)3`~7brGDyQ35v0gthG1CM(Io?r<^Iph&|k|i4DQyzh*S&~sc z;}Q4)V?X~0ry^;NR0A_S*#>H>iVPe|vDXpyi?bPHKV;7vc#b^>9Gd?j`?`VW7<;7s ze8LgjZ-VlWaiZRkaiZQe6GgpiCZgVBq8_uFV`i9RI?XYsT)~(x^>EBrM7`Nt)Q;aL zHFqQitsR@JO&yz0?&xr^AAi3kP8k~QQ{r3WgTv&UP37GKg2M>+Bn#X-2>K}BlYU>i z(C^O__|)*=FgaG+NVJK1f2hrY>qOmyV+9@_FYrh{>PNkQRR8%&LjS^KfiK=D#-Kb6 z>5qERF6w z`#ey+_B(^aqGN;2Z;1CM+NdPkBAeb{?t7}|L!`W zm+fheq4uXY3jO={qyLEZQh#QP(4T7-dfA@lAZq`?Lqh-I!{|RE&LBPIFz6Vf_eNvi zKkJwtv=U~`F150=LqD=;m?+Uk_9^ErW1gW{34Ivt&sKee%`}g|bX{OZNZ=5PZ)6{G zZj*h;veeVuMtzbUY2!)_dom`JTDy&L9{Kc#iI}_CV+QX2Ow9AR!S7{H8n~bIonvCM zVmKE-`!GBXvXWjZtLV;$BM#?Ge1J^Vc9HB;!bqS}p`E%;Fp?M9mID35iybvOmKr&wG8r{|^bXDn8< z9KvOamP@!&(OiUgDq0?)Ptghp{fbsdSgB}5gw=}XCcH<{rVy@Ew5fy}740U%`xVVY zxJA*X5jHE@48n&LZ5H9fiZ+Mv5k+-%+&HgzqU@1>spmyPNRu z6s?NzyrR_*eynJ9gg;iadcsc>t&#Ab6>UA?FBEM9;eROFeT2VJv`vJ+$B7l;pA}6f zjI(G#!g!0;L^!~rZ6i#!Xb%z&vS`~0(=FN#!c2>{lW@32YatwI(OLV4Us2i)y_(_RSN<2`401NVA=VrQ=z^^UV|8+c+W_-dmr%3iyIIm|PBg(c79#Yhneqk9rRzR{>x4zMNbKJm!5fx!xn@%SM!6 z^`1-C=ZJWr?RF6_Yyf}U`;+8Qm53WI1Vr4h3H-pwW-N#IAsC(KQgZLUpDug2ESwzB}UB ztZ&VlZSA;G=^F3o^9}oth6EmSpzqFj2f21C$BK9K1xCB$A%Q2TUU%`1zR+kB3JE+( zZ7vt@=!=Z{r$PcxQ~jRAJCq~C5_5)fWWW!o-BraqIESD#)tlj;vKRHZ8~!^ zWCO}q6z`B-Jvg_!Y`jC`nmHaLUZFlIUb$4fL;FhlOT|01U!?Dkch2-A-nmphu|M9y zdfy-K{ME%f(K#6z??mTjWW2-7oRUuK5aJz~TaxjP%rVJ$N9LMjyd!f?GTxE7CmHX^ z9F&ZAWG?D|GT!MfXM|OxGv0~AHOvS87{ocZZpcP!uxq}@%*E4MX2d;FIW^*A#XrPT zyb~+$bE$j}wV`+?R^BI8zK7~lyb~+$bGdvEwWD|^R^Es5J<)j_%I)+d2V}%QGLIXP z-!bw&5qThrf1>j^WE16gy2}ej#y`}D%o#8<9-+KXWFCm(pICVw%Ky-q6#tN3_9NpO z%Kwm_;-6S~9?Jiap5hp7J}`>#E6>y2^{!ZXp37bDik0W-$@Q*S zd7j=}@1lK=b&ZSoE4ii>nZt?9)llw+`ii_}MdiqAR8)?&%$ ziZ-DS<6ambT@#Ymg5))z9pLF+mlh!1<07y1$ZI_N!PC7gZ9vLf$n`l1^XG?gU9YFa z$|qcIU5%Ab=x$w&l~1_bx*98=(35pFRz9JBUF~02aldN+{i!(i``6X}b+vz8#XY82 z7eDWH6;ESzu=98(fa7H5V<5g0!8nIFZ{GVaFkKgz5fV6reQfA8kHAb_;Lwo3Yp4(T zd}tc(ch^Q-tE2-nfpq`-P~Z^YH9$P;7NGm@(}pX|NjL)E6T#xo+PC2OG+;82PtOSq zobC$1%(MKH#~I8#JMfdI8GSDVpE)craOSlE+SYI%e){HOYbJ2W<`q`HxLD=4E>Zci zr7FK|xyo0rQ2Cc`SNWZHsJ!GZmHSqyysT8^{&JPCS*`NQ3YD+DTjkYNDzB|k`8{U%Kx<-}XyI$oRH>mu}_o@8;O)B3UQ27>JRQX#c zRsNl~RsPN?m4EkLmA`jd<==Z>~{Qv%g z%71)8<^S&|D*xmkRsK&uRrx>vOy&RjFDn1V&sF~K|EBW)_@&DK^H(bW&97DdU%yrP z?|-NAKmI}GfBuuo!=I`=F3!d+78{R`xADY88y_&h#%(qmPfoV+fdg%P&>$NhJlMw5 z(`|gn5F5|TwDD`MvGL)ur4UWE;QX1{=TeMjOBRW*hff6*eMr3FF0@4yu`_^lW`88IySo^)iOFz$HaZ&#&^J=&h&ZnTj~5i>{RV@a0rQK zA)WR#n`(UD2j%SDs#-zMLQ>yBXOf-9I*2jS1|(gqlZ59m@tozQo(s|0@%F)ZesVCL zpLAOC0=brgK%S*AP-rQ_u`}n4h8zJsGonR3`;j?O@RY{?F=luZ{>of*1Ln3ti4v$HF3F&E!IfO4+M7hdG&zz_4`hhq3)X&?AO&R=t3 z#{$@~7{!zWc6?#;*C2EL zB8;5Bs7&)0VZ{7Z0Y6m14|VWEV;}fI&R>a#@vAz%N zkn`7C*ij2R8eqr0ePD;2zc#>*O|V0U9Zh{;hn&AQ!jAi4#}?Sp+y{2Z`RhK|5r7>* z*s-k-?2z-MJj&}OqE0IEX{pGkQC=_1UdMMJGB)Kl^8S|ET?ZJiZuati_4^TMqJXxyZk{0uHO-bHKa6 z7l1DU?@1E;IPep|PXvEG_*HhnyTMNd?*Tspyq+TX$>47U?*%^-d|R5}r+~i+{50^h zz=t%!-vIt*@YBK127h6g;6=XS)AJ(VU|_D!49p7&ETlSmiov-<0;jS;M%m*LXneos z#e4(4xS!&3w-S|aF!`xbe4fdBqWD6SpK+0&gZ-F~{V2wMEX96Ef6oQK0Q_R`%fL&2 z-vWLi_$A<%gO~oE2YwOwTfyH3Uiy0l_&dO_0$&bZ`gFH9Zw3DV_%F=gY48vEI~^G5Z{l-JKGNUBlfS8c zq`!$Le-j_!?+39TJFp)u*pJ=V59#lRz&{NB5%7xOMgEB{%hc$0sk!cFU;Q= z@DKTW2r$y$#FM{?kMuY3mfX3@h~t7x4u)()&aZ79dF zW#3x1+}iQRKQ$vSOS#!C9SKOE(reWqf%0cMaoNRq;rkPf*UwG8YL@Fw{LQD~TG~8f z>M{2kuhVAPEOx^D${PIoe~7eUsY8} zZ9_?2eeLSXDu2U-hWfGzRaNV&$Ah`sU)E?89C_vCzKOo_)su>@FDmlq`Fv%0<#`kH zub)`vFQ0h*q{;q#{9tBTY1!CLvp=%|*W#7P+M2qm==UOsKZqM>BVR*|LL$q}iDVhW z{kM1}vJ>OF^5ac@Rc=?_ZSq!8p;I~CPP)CSNtHE5Fj@Hx+i}%T0b&RDI)3 zhMmPx{oiNSPg~QqpJtOkQPP#CubQX*J#~Lq{t1(BiSp+VaBTEkpxYSy_bd<_-M{7-yiJ#*F9mirog%+=`M&}h{5RacgQ z_E*){;-gnyjt?GlmDN_$5BxCt0h~r(DX;>q#g;%;=+$J0YF%B)0)KkTVM32c!h%Qfng4)aVqx1Kc%-@gh z{GDjo0acE8O~N*{rG226Jig`cI@bx>@DXiF|Ka;Gjn_>kC);a}30_+#@Mm(N62;H# z{{rx2Ql6=oAlrXa5++LXB`yaq- Date: Thu, 25 Dec 2025 13:08:21 +0800 Subject: [PATCH 06/10] update 942 kernles --- ...6_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31800 -> 31800 bytes ...6W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co | Bin 31872 -> 31904 bytes ...6_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31904 -> 31904 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co index 1f28af088bce0de9eb28afc7a0b0561b8d5df5b6..43fc35bb8bcff3f608d1f92756dd4a56b3d9c12d 100755 GIT binary patch delta 37 lcmdn-gK@_X#togZjLn<7Vvn-`sh}ho!WNr}+Qreb z^-{|;W}&s8#xNhmWSKnhK*ocMF87wG35gHDgD=E~`7}*5(L@b<+4cMGpPE~~F;4Q12jV6 z6kR6!vWL-*u~1K!9f0(<({{bSJc${w&}A~#A&eY)-R7)+L0|q86QpMuc6em5JE?ut zWB($&4Bzn_?DOYi2i#O{YL$Z@_~NLy;LVSh#)rpCI&V*@ulFJ6?>Qy>+_)n6@h7ry zkMBh%8jh&g_VWgQcw&ar34iPIf(SqIWaQBpPKw7lZgw^|HTj5RqonAN{{TFlc~3{j z``~F4JsvaQfkboBLevm0g*~3D*6xn0+d~&Ww6#B^a*7%aGtr6C1U%wdqmkoGCPfkH zueZiXu>p%)A#h-Q8^BuAd0T{Uc(U!08Qj!t=D6BgxgqyxDje;Qncqk+D$% z19)H3^6d)$_>7kCQTR(HqQgoc9H1zdx*+zTtDK>GzLS%wQ$g&-zq$Bp6ancBE1k2v*A2kxJP}Frei6 zeEmUUc9YIyl^(v(Ilr30h62@B)$z*<{eCk=ZKA|tpJ_`uXu*ygLB-F@mG|{3{VcVmKki=Rk&R; cYP@Rk!=OJhXQ(1-S{gLLO^JqPlvr~82l26OKL7v# delta 1680 zcmb7EO>7%Q6n?w$e{AaC7$S$n$VwcVHi_BB;|P%-rxC(w;tCE)FCnOfA__4fQE(FC zvFntmpyKWTs*sSlaV`mxD};ABAbhA$ija^vl>>!iD+Fpdu*~f2x;W{Dk!HX5zIpF^ zJF~OzvwZ>Y3zfJ4S9?Dnt1o~)Oaw3Y1qwDg>&x7)f7pyU?gAn{A?!O8MN&df^bl)k zJjg8yuQ+#)aH(B#PY4K~?pVdI9&r!NpP4?f@apopB`HC!zunU^pL>h^;hk}(9=5RJ zd!DF%8@cHV3W)sT>j^T+QILUcH+VH}N?_5wVnLXpL2V;LD7 z44;!RCCXz+&cNl`9>_HWS15i==O;A{-Xn{_t6s5~m8WNJ;nUexWk0;75N{|LI^O>U zP8@4h?olE^iz1rPPg0@#!pu&ht(GpNi>-=iEh~C8S|y#ClCv`iBQd@if~_B>;v%%c zB^^b;O30eqY_l$HTBwa?XK`w7u5>zE#b}C&|6F?);@#zVEOrs%U0OWu2CC`rjh(k- zkn9KPmj6uvyM|=?)qwg|o!EK$%y!_JuZ5EGzsfrtIOTL2<;k7bA1@rLg7qqT5r0ef zW_fH_P7j~rJ3K|-*zLyH}+x!JE6?{%8xDCQ9-0w9Rj$wn}KHAJbVeox@&HOQg-#fq~gOAPX zMxl|FHWtuigVK#@gMU_P=I0E)??C)ngOBh6jU;{scDVg=gQ7+TV9iL_p8xLxz?Q+^ ze5aWgSc?!HU^X~&wL%tF!@_s8n}kVaHD5>$z=egE&Rv)v7#u%onu$cBVM^l-Gbx$o zgp@K(ly0O+CmLxe-AJ2|l1A*{z$7x$CV7bftvon1{?x&y_0UeHcT0DES=c4Te802m zNL1WaijBJ;JbYq diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co index 32129cd44c8de35e407307f0a14460312ebf61c6..09f500dabaf5e7e0e8c1b7048cf2453d124e5411 100755 GIT binary patch delta 134 zcmZ4RlX1aM#t9cVS0plk!HSJH4+}9mY@RFJ&&YUZv##iI7w#`CE&Km7oZiR4z&QDT zq#kpVcmHOcs2nMtgU?&{Gcfc3apV5YD@q&~85eARS!&M6cw(|*xj5s5$(H3887(Gr lSM>23G2|pBrRKyNT9}zIOg<=Jw)swl3KQds&5YIa83D#4GJgO7 delta 134 zcmZ4RlX1aM#t9cVmn1TP!IF(P4+}BcZ=NgM&&YUtv##iI7w*q2E&Km7oZiRaz%coL zq~2!3sB9^LL(g0HGcfc3aU&2jFl=5?;=sr_XY Date: Thu, 25 Dec 2025 15:46:54 +0800 Subject: [PATCH 07/10] fixed nhead32 --- ...6_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31800 -> 31800 bytes ...6W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16.co | Bin 31904 -> 31864 bytes ...6_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31904 -> 31904 bytes op_tests/test_mla_persistent.py | 14 ++++++++++---- op_tests/test_mla_sparse.py | 8 +++++--- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co index 43fc35bb8bcff3f608d1f92756dd4a56b3d9c12d..1f28af088bce0de9eb28afc7a0b0561b8d5df5b6 100755 GIT binary patch delta 37 lcmdn-gK@_X#togZj7^)nVvn-`sh}h<8SaE?s`Kbhiau? z2qM^&U_lp)`ODmBA<|fkr{kGgiMQt@ts8cUCx*r3D4~Q8&--EF%W#;7Mp)F82P*mP zsrg3xt$7=IGCodY6BF67gHdI4A%`LoitWcW3*>KuJ1|j=t8wFS`%MY$9!&}SB&?L z=rUQf}s4@PoPeJI&3PQ-%OT=b8akF}2`L9pG6A9pvlm%Bvy-7cQBjR#h z&AMELsT+1j6L?Y^%}rHQfwdywU)kKGuzLG6mzHz*ZMTg&%~wNt|2Bv=VDP1bm3*(k zJ3A}+A%m~~!->H|THaQ*OKCo}z0LxF$q@&$%ZSvyfYv;Rr22Ytb z5{Xz=7_@9uFzrbpW?B-r%7mj<8HrnECxxhy+tWQn%(5Z9g%dmX92-0W?7im*OZo6| zni(lvm|5kx660pOYRvPOSZp@xCC~MHkUP4TSY|fL5%zYr&;Qe4FV%+-ha2ZE=AJvq SQ5Ku)tJ$2^FxXgTuI&#i!Y<|j delta 1370 zcmZXTO-vg{6vy8z&e{ZM{8>AQF9V`mVp8ndbZtd~07XG2E*uP|Qbni|H3yVZH4$G) zJUE7+D8h~>xwMtaA&PvC4~e2^cTrRcsRzh4?SUSuMCzf5)C0L>JG-9UXv0XW_ulXQ z-+ME&nu7zh^%fNcu5IZ~_k+{rS06%)bnCn8^*z4 zZrsP|x+1kUs%uqS^}MD}Pl7xUSd-IRVhP~33~);pTUsz~ZO!d6eZj-&{dcLY2k=9o zj#ISr?^5ffws4G70zi0AhnMG-aZ13+Dcbf2sKw+PPt&oFdF3?)+DrwuCF1lpp*_;$ z{WZP{&m;_c(pRbHcp^8kiQzY?_+_XN%8!;thek`bP*3TzlMivEr%wLUza~fM5k>e( zI!TWB4p6G{&L<6xO<|k-pDiz|@Vf+nAeo|~Oa|lW>DkUu)LzH9PIG9R~mB!$)Bsv@z*VeL*gum31 z(u(c0xf`O--9G@hYr3wh@I<0}fW#CXT3?TGLqoMBp>1MQDnPU=7maq>6|t1aB?<{T zyph&7{#aSR)Xs+jHNtgp&hIw{YOqSjYQ6M4?Y~811nEcVNt?X=a!sx%m(QT=R$jaG z_ly1}y6kYj<2}{N?f!ys(t*6Jc)l+Y56uy}gmR;f99>4L1@QG2NJ{jfi1`4#)ZoR~$CW zqA{~f95%~_V^O9JhkIk1+oDY#F=vGP1|x<-?27`~c79Ig-6HvC@hnl?eC5>AFNiD8 zk~dd;bE-iR)_rQCQS5G$O4K!T*h>F2?`> diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co index 09f500dabaf5e7e0e8c1b7048cf2453d124e5411..32129cd44c8de35e407307f0a14460312ebf61c6 100755 GIT binary patch delta 134 zcmZ4RlX1aM#t9cVmn1TP!IF(P4+}BcZ=NgM&&YUtv##iI7w*q2E&Km7oZiRaz%coL zq~2!3sB9^LL(g0HGcfc3aU&2jFl=5?;=sr_XY23G2|pBrRKyNT9}zIOg<=Jw)swl3KQds&5YIa83D#4GJgO7 diff --git a/op_tests/test_mla_persistent.py b/op_tests/test_mla_persistent.py index 68d558048a..ea358ff805 100644 --- a/op_tests/test_mla_persistent.py +++ b/op_tests/test_mla_persistent.py @@ -300,15 +300,21 @@ def test_mla( dtype_q=dtype, dtype_kv=kvtype, ) + q_scale = torch.ones([1], dtype=torch.float, device="cuda") + kv_scale = torch.ones([1], dtype=torch.float, device="cuda") + # import pdb;pdb.set_trace() + def test_absorb_decode_bf16(): kv_last_page_lens = torch.ones(batch_size, dtype=torch.int) out_asm = torch.empty((total_q, nhead, v_head_dim), dtype=out_dtype).fill_(-1) + kv_buffer_cal = kv_buffer.to(kvtype) + (attn_logits, attn_lse), us_asm_decode = run_perftest( aiter.mla.mla_decode_fwd, q, - kv_buffer.view(num_page, page_size, nhead_kv, qk_head_dim), + kv_buffer_cal.view(num_page, page_size, nhead_kv, qk_head_dim), out_asm, qo_indptr, kv_indptr, @@ -316,6 +322,7 @@ def test_absorb_decode_bf16(): kv_last_page_lens, max_seqlen_qo, sm_scale, + kv_scale=kv_scale, num_kv_splits=max_split_per_batch, work_meta_data=work_meta_data, work_indptr=work_indptr, @@ -336,6 +343,8 @@ def test_absorb_decode_bf16(): out_asm, msg=f"mla_decode-absorb [golden vs aiter_asm]: {us_asm_decode:>8.2f} us......", ) + # import pdb;pdb.set_trace() + return err, us_asm_decode def test_absorb_decode_fp8(): @@ -343,10 +352,7 @@ def test_absorb_decode_fp8(): out_asm = torch.empty((total_q, nhead, v_head_dim), dtype=out_dtype).fill_(-1) q_fp8 = q.to(dtypes.fp8) - q_scale = torch.ones([1], dtype=torch.float, device="cuda") - kv_buffer_fp8 = kv_buffer.to(dtypes.fp8) - kv_scale = torch.ones([1], dtype=torch.float, device="cuda") out_ref_fp8, lse_ref_fp8 = torch_mla_extend( q_fp8 if dtype == dtypes.fp8 else q, diff --git a/op_tests/test_mla_sparse.py b/op_tests/test_mla_sparse.py index 46faebdc2e..e40da3a66a 100644 --- a/op_tests/test_mla_sparse.py +++ b/op_tests/test_mla_sparse.py @@ -494,11 +494,15 @@ def test_mla( is_causal=False, dtype=out_dtype, ) + q_scale = torch.ones([1], dtype=torch.float, device="cuda") + kv_scale = torch.ones([1], dtype=torch.float, device="cuda") def test_sparse_mla_bf16(): kv_last_page_lens = torch.ones(batch_size, dtype=torch.int) out_asm = torch.empty((total_q, nhead, v_head_dim), dtype=out_dtype).fill_(-1) + kv_buffer_cal = kv_buffer.to(kvtype) + (attn_logits, attn_lse), us_asm_decode = run_perftest( aiter.mla.mla_decode_fwd, q, @@ -511,6 +515,7 @@ def test_sparse_mla_bf16(): kv_last_page_lens, 1, sm_scale, + kv_scale=kv_scale, num_kv_splits=max_split_per_batch, work_meta_data=work_meta_data, work_indptr=work_indptr, @@ -541,10 +546,7 @@ def test_sparse_mla_fp8(): out_asm = torch.empty((total_q, nhead, v_head_dim), dtype=out_dtype).fill_(-1) q_fp8 = q.to(dtypes.fp8) - q_scale = torch.ones([1], dtype=torch.float, device="cuda") - kv_buffer_fp8 = kv_buffer.to(kvtype) - kv_scale = torch.ones([1], dtype=torch.float, device="cuda") out_ref_fp8, lse_ref_fp8 = torch_mla_extend( q_fp8 if dtype == dtypes.fp8 else q, From de639217185d9d9b14f81106bee702710fee80d6 Mon Sep 17 00:00:00 2001 From: zanzhang Date: Thu, 25 Dec 2025 17:54:35 +0800 Subject: [PATCH 08/10] 308 ready --- ...6_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31904 -> 32096 bytes op_tests/test_mla_persistent.py | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx942/mla/MLA_A16W16_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co index 32129cd44c8de35e407307f0a14460312ebf61c6..945360d7b0e512ed1503d27d345bca0c5f9b42a2 100755 GIT binary patch delta 1166 zcmZ4RlkveX#t9mX4<>3Zug^$g00S7!AOPVr0x32i&M1ZOGp3@_1rRR73@#`Gs*efC zB}N}q|73qgK_$X=Y;Izd5@IaaJXg4%k%>oQv!3W-Mq%bf_464R)EGnpAILDUG+bn6 zsGDpcZb%jJ1>&|;F$b4>dL&$FYJP*{Lt2`@SwQ-U9rqWOmi_-3PVZx2V4VCvQjfXG zyMMDzRE`wS#OJO185nwixN-mH6(tUgObM-%UzO^4?$Ls#7AU0zrr8;4bWp{0q2dY< z23*ntDt=^gVwrfoL?m1S%bRqdLzJ;I*qEYn9b5>^m<<&VXhjtl2W2!S4u*`WlY`2)*IQaJ2hQ_8KrX0v- zH@F>!X0Aps6&98z43iHEm~FmOp~A$Jku6K{>lD~f=Yz#*xbY@CB$gId9H9jBhzj1&3dAT8QDbwAILDUG)z{IFeJ#nAZ|-g z4YE0JBwR^SFCg`h6!n`gNI$XT{>;*{|3Aa&eGCo^llMpIZT=XQEhTX1dFy@#h8`ep z1Y!n;%^9T*j7)P{Ca)^f^VHD-2_b+In90ttM;jpqV(CJ~6Le6;Eui8olNHOw>mwpi zW&NPCIc})pkx=m(6IAgOsCWXJ`h2K(0Gjz#P;rrTFk|xeazU>>@nANbm;hCCq!um& zX3U0)_q3pji-Qs>69>bJDU*dNw%41PGvp*DrRKyN7#LZ?n1;?yFs6}_8=Prs0cTnm z!k8|mW-z9O3tXFtDO{U{3tXFvsR>lAp`n2(h$#oM*$r-op_voRU_%QtV}{8G1`o(0&8goX8$(1$sOeabw@2at5 RnovIZSB)Jb!(_`^bpY~AzzhHY diff --git a/op_tests/test_mla_persistent.py b/op_tests/test_mla_persistent.py index ea358ff805..ef15f7a153 100644 --- a/op_tests/test_mla_persistent.py +++ b/op_tests/test_mla_persistent.py @@ -213,7 +213,9 @@ def test_mla( max_seqlen_qo = seq_lens_qo.max().item() qo_indptr[1 : batch_size + 1] = torch.cumsum(seq_lens_qo, dim=0) total_q = qo_indptr[-1].item() - q = torch.randn((total_q, nhead, qk_head_dim), dtype=torch.bfloat16) + # q = torch.randn((total_q, nhead, qk_head_dim), dtype=torch.bfloat16) + q = torch.randn((total_q + 1, nhead, qk_head_dim), dtype=torch.bfloat16) + q = q[:-1] # troch implementation out_ref, lse_ref = torch_mla_extend( @@ -302,7 +304,6 @@ def test_mla( ) q_scale = torch.ones([1], dtype=torch.float, device="cuda") kv_scale = torch.ones([1], dtype=torch.float, device="cuda") - # import pdb;pdb.set_trace() def test_absorb_decode_bf16(): @@ -343,7 +344,6 @@ def test_absorb_decode_bf16(): out_asm, msg=f"mla_decode-absorb [golden vs aiter_asm]: {us_asm_decode:>8.2f} us......", ) - # import pdb;pdb.set_trace() return err, us_asm_decode From ff168aff5ace18c0c9d30217f4f6f7b13daa6a76 Mon Sep 17 00:00:00 2001 From: zanzhang Date: Mon, 5 Jan 2026 15:50:25 +0800 Subject: [PATCH 09/10] update nhead64 ps mock --- aiter/mla.py | 6 +++--- aiter/ops/attention.py | 3 ++- csrc/kernels/mla/metadata/v1_2_device.cuh | 5 ++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/aiter/mla.py b/aiter/mla.py index a22a6fc4d7..503c2163b8 100644 --- a/aiter/mla.py +++ b/aiter/mla.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. # user interface @@ -271,8 +271,7 @@ def mla_decode_fwd( num_kv_splits = get_cu_num() if nhead == 16 or ( nhead == 128 and q.dtype == dtypes.fp8 and kv_buffer.dtype == dtypes.fp8) or ( - nhead == 64 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16) or ( - nhead == 32 and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 + nhead in [32, 64] and q.dtype == dtypes.bf16 and kv_buffer.dtype == dtypes.bf16 ): # Natively support cases pass @@ -284,6 +283,7 @@ def mla_decode_fwd( q = q.view(total_s, nhead, -1) o = o.view(total_s, nhead, -1) io_transformed = True + max_seqlen_q = 1 else: assert False, f"{nhead=} and {max_seqlen_q=} not supported" diff --git a/aiter/ops/attention.py b/aiter/ops/attention.py index a433bd213a..5fbf797b5e 100644 --- a/aiter/ops/attention.py +++ b/aiter/ops/attention.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: MIT -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. import math from typing import Optional, Tuple @@ -631,6 +631,7 @@ def get_mla_metadata_info_v1( int(math.ceil(max_seqlen_qo * num_head_qo / 128)) if num_head_qo == 16 or (num_head_qo == 128 and kv_dtype == dtypes.fp8 and q_dtype == dtypes.fp8) + or (num_head_qo in [32, 64] and kv_dtype == dtypes.bf16 and q_dtype == dtypes.bf16) else int(math.ceil(max_seqlen_qo * num_head_qo / 16)) ) batch_size = batch_size * max_seqlen_qo if is_sparse else batch_size diff --git a/csrc/kernels/mla/metadata/v1_2_device.cuh b/csrc/kernels/mla/metadata/v1_2_device.cuh index e466aa00c4..9a51a4e9d3 100644 --- a/csrc/kernels/mla/metadata/v1_2_device.cuh +++ b/csrc/kernels/mla/metadata/v1_2_device.cuh @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved. #include "v1_comm.cuh" @@ -443,7 +443,7 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba // launch kernel MLA_NUM_HEADS_DISPATCHER( - num_heads, + num_heads_per_head_k, MLA_METADATA_DISPATCHER( max_seqlen_qo * num_heads_per_head_k, kPackedQoLenPerWg, @@ -456,5 +456,4 @@ void get_mla_metadata_v1_2_device(const torch::Tensor& seqlens_qo_indptr, // [ba dev_prop.warpSize, dev_prop.maxSharedMemoryPerMultiProcessor))); - } From 78826e019cebbba05da9b7e3bc8854f427f85897 Mon Sep 17 00:00:00 2001 From: Zzz9990 Date: Thu, 25 Dec 2025 20:12:21 -0600 Subject: [PATCH 10/10] update 950 kerrnels --- ...6W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co | Bin 31744 -> 31752 bytes ...6_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31800 -> 31800 bytes ...6_1TG_4W_64mx1_16nx1_Coex0_Msk1_QH16_PS.co | Bin 31904 -> 32096 bytes 3 files changed, 0 insertions(+), 0 deletions(-) diff --git a/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co b/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16.co index 7a0493c0e496829a2b4b3e32ffd60c9b5d5b7e60..9129542941d517ed73235e7c7cc210f914348367 100755 GIT binary patch delta 377 zcmZqp!PxPGae@Y8$41TljGQt23}6s5`6Ht+qw8i?rer=Iv-l`K)5Oe@)FKn3$sYvO znd1wJC(jn}=dr{lAuAX>IZsfE(PDC^poT&N3j@Oh1_nl;sgsV_ZPr)yv||RcK^O!W zCp%V)aHAO`05so~0Zot{B&e62GI=4R%H~F){RxZ-lO2oY4c`btOyYu23`J1->%af` zAm$3FJkT9raAxzw;_FO|k&`pa^cm$h&n&B8WP*4LtF>Ui=2Zku-dCZ-7&G}{g+6Bj t)Wn3zl9lG1F!NVTjs)_~K;_O%o>^%w2y+{ZF53LC@;@VE(&UHL@&G6XW;6f* delta 369 zcmeD9!PxMFae@Y8!$!^hjGQ6-3}6s4`6Ht+qy1)9rer=Is*Mi?1^=22ai`(`S^~JhQBV5vPTdODclEemGMh!x%F8WraRx1k|{Q z$%>WcoG`-|Oil#yjzHy(OkP-NE(miNjLw;ySRy*PzKU}*d(}fmrWnS_*;TTP3;=tw BTSou@ diff --git a/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co b/hsa/gfx950/mla/MLA_A16W16_1TG_4W_32mx1_16nx1_Coex0_Msk1_QH16_PS.co index 9832c222c6982951679ab44cc57955f627b5fd41..37b2431f5ea65211b12c53fbed992332ecd217c2 100755 GIT binary patch delta 37 lcmdn-gK@_X#togZj7^)nVvn-`sh}h3Zug^$g00S7!AOPVr0x32i&M1ZOGp3@_1rRR73@#`Gs*efC zB}N}q|73qgK_$X=Y;Izd5@IaaJXg4%k%>oQv!3W-Mq%bf_464R)EGnpAILDUG+bn6 zsGDpcZb%jJ1>&|;F$b4>dL&$FYJP*{Lt2`@SwOlcl5yhZxh39=ObM-%UzO^4?$Ls# z1Sq8hrr8;4bWp{0q2dY<23*ntDt=^gVwrfoL?m1S%bRqdLzJ;I*qEYn9b5>^m<<&VXhjtl2W28A4u*`W zlY`2)*IQaJ2hQ_8KrX0v-H@F>!X0Aps6&98z43pmpm~FmOp~A$JkuKnn$p$&J6K{>lD~f=Yz#*xbY@CB*2kd9H9jBhwx6&3dAT8QDbwAILDUG)z{IFeJ#nAZ|-g z4YE0JBwR^SFCg`h6!n`gNcTiC9^4#T>dnZspk?x^GCfZnEoewWDJ3w?&ag)tRa_S; zo}hy&ZUGf%nXFhYULO&GD(eT8&2d8&kA#ZXn4pTMK*bZ#)aOIR1JKN`f{KfzgBg>z zmkWCBi3hXc#003C6SZ(5Fk?1Ud_oJVxHu>gGI21hm@-+YVtc)r1w&3^Qff}Tfq{`F zjA`iX1Y;T*xxtyHaJ5DjhA>eVQ!^OT!Ue9)#1yX0!Ue9)#nc3<*3i(v6vUJR+3W_l z!_dsh2&Tfq%!Fa`I{~xJb1GGsm{ufCcC410{GeKZ3!ZCACjY876O@<=@t{O0gnlvE ivBsQHVsd4TJ=2NO$-8Rom?o4@{#9eg$S~QmRviE&k+JCj