diff --git a/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv new file mode 100644 index 0000000000..90d9a73b9d --- /dev/null +++ b/aiter/configs/model_configs/a8w8_blockscale_bpreshuffle_tuned_gemm_dsv3.csv @@ -0,0 +1,601 @@ +cu_num,M,N,K,kernelId,splitK,us,kernelName,tflops,bw,errRatio +256,16,3072,1536,7,0,6.6778,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.61,725.01,0.0 +256,16,4096,512,12,0,3.4188,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,19.63,654.15,0.0 +256,16,7168,2048,7,0,7.4162,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.34,2014.81,0.0 +256,16,4608,7168,7,0,18.4179,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.39,1807.6,0.0 +256,16,7168,2304,12,0,9.3937,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,56.26,1786.44,0.0 +256,16,128,7168,7,0,10.9098,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,2.69,94.99,0.0 +256,16,2112,7168,7,0,17.9549,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,26.98,853.31,0.0 +256,16,2240,7168,7,0,17.9866,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.57,903.04,0.0 +256,16,8192,1536,7,0,6.5764,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.23,1956.94,0.0 +256,16,11264,1536,7,0,6.7797,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,81.66,2608.75,0.0 +256,32,3072,1536,7,0,6.5994,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.76,752.24,0.0 +256,32,4096,512,7,0,3.336,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,40.23,712.13,0.0 +256,32,7168,2048,6,0,8.3186,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,112.94,1827.75,0.0 +256,32,4608,7168,7,0,18.2735,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.68,1836.23,0.0 +256,32,7168,2304,7,0,8.841,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,119.55,1928.24,0.0 +256,32,128,7168,7,0,11.2862,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,5.2,102.34,0.0 +256,32,2112,7168,7,0,17.9464,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,53.99,863.87,0.0 +256,32,2240,7168,7,0,17.8765,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,57.48,919.03,0.0 +256,32,8192,1536,7,0,6.7336,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,119.6,1953.84,0.0 +256,32,11264,1536,7,0,7.4643,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,148.35,2421.06,0.0 +256,48,3072,1536,6,0,6.9309,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,65.36,733.99,0.0 +256,48,4096,512,7,0,3.4031,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,59.16,739.02,0.0 +256,48,7168,2048,7,0,8.2067,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.72,1884.62,0.0 +256,48,4608,7168,7,0,18.2884,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,173.38,1849.07,0.0 +256,48,7168,2304,12,0,9.7874,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,161.99,1768.99,0.0 +256,48,128,7168,7,0,11.3869,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,7.74,111.87,0.0 +256,48,2112,7168,7,0,17.7186,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,82.02,885.26,0.0 +256,48,2240,7168,7,0,17.863,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,86.29,930.16,0.0 +256,48,8192,1536,6,0,7.2732,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,166.08,1848.3,0.0 +256,48,11264,1536,6,0,7.746,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,214.43,2382.72,0.0 +256,64,3072,1536,7,0,6.6382,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,90.99,784.87,0.0 +256,64,4096,512,12,0,3.8482,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,69.76,689.73,0.0 +256,64,7168,2048,12,0,8.0525,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,233.35,1953.26,0.0 +256,64,4608,7168,12,0,19.4883,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,216.94,1748.68,0.0 +256,64,7168,2304,12,0,10.0827,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,209.66,1743.58,0.0 +256,64,128,7168,7,0,11.4701,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,10.24,121.41,0.0 +256,64,2112,7168,7,0,17.5361,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,110.5,904.87,0.0 +256,64,2240,7168,7,0,17.762,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,115.71,945.94,0.0 +256,64,8192,1536,12,0,7.2158,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,223.21,1902.74,0.0 +256,64,11264,1536,17,0,8.7114,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,254.22,2162.87,0.0 +256,80,3072,1536,7,0,6.7552,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,111.76,789.46,0.0 +256,80,4096,512,7,0,3.9648,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.63,704.57,0.0 +256,80,7168,2048,17,0,9.8854,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,237.6,1617.62,0.0 +256,80,4608,7168,6,0,21.4354,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,246.55,1602.06,0.0 +256,80,7168,2304,17,0,10.1472,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,260.41,1758.74,0.0 +256,80,128,7168,7,0,11.4594,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,12.81,131.89,0.0 +256,80,2112,7168,7,0,17.6228,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.45,910.76,0.0 +256,80,2240,7168,7,0,17.5966,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,145.99,965.42,0.0 +256,80,8192,1536,12,0,8.4612,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,237.94,1656.56,0.0 +256,80,11264,1536,11,0,9.939,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,278.52,1934.46,0.0 +256,96,3072,1536,7,0,6.8027,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,133.18,802.02,0.0 +256,96,4096,512,6,0,3.9901,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,100.91,735.0,0.0 +256,96,7168,2048,17,0,10.0382,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,280.78,1619.11,0.0 +256,96,4608,7168,12,0,19.4234,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,326.5,1781.51,0.0 +256,96,7168,2304,17,0,10.1868,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,311.27,1778.04,0.0 +256,96,128,7168,7,0,11.4861,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,15.34,141.93,0.0 +256,96,2112,7168,7,0,17.5815,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,165.32,923.27,0.0 +256,96,2240,7168,7,0,17.6742,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,174.42,971.73,0.0 +256,96,8192,1536,12,0,8.6168,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,280.37,1659.92,0.0 +256,96,11264,1536,11,0,9.7862,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,339.45,2004.01,0.0 +256,112,3072,1536,6,0,6.6529,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,158.87,838.54,0.0 +256,112,4096,512,12,0,4.239,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,110.82,724.7,0.0 +256,112,7168,2048,11,0,10.1121,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,325.19,1633.2,0.0 +256,112,4608,7168,6,0,21.0607,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,351.31,1655.46,0.0 +256,112,7168,2304,17,0,10.2514,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,360.87,1792.8,0.0 +256,112,128,7168,7,0,11.926,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,17.23,146.65,0.0 +256,112,2112,7168,7,0,17.5452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,193.28,935.57,0.0 +256,112,2240,7168,7,0,17.6166,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,204.16,985.49,0.0 +256,112,8192,1536,11,0,8.6159,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,327.14,1693.38,0.0 +256,112,11264,1536,10,0,10.6725,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,363.13,1873.66,0.0192 +256,128,3072,1536,6,0,6.7885,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,177.94,839.9,0.0 +256,128,4096,512,6,0,3.9371,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,136.36,815.64,0.0 +256,128,7168,2048,11,0,9.9822,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,376.48,1680.71,0.0 +256,128,4608,7168,17,0,22.2306,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,380.36,1580.13,0.0 +256,128,7168,2304,12,0,10.8525,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,389.57,1718.04,0.0 +256,128,128,7168,7,0,12.0993,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,19.41,154.37,0.0 +256,128,2112,7168,7,0,17.803,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,217.69,932.26,0.0 +256,128,2240,7168,12,0,18.9357,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,217.07,926.68,0.0 +256,128,8192,1536,12,0,8.6274,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,373.37,1724.35,0.0 +256,128,11264,1536,10,0,10.7933,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,410.36,1888.37,0.0237 +256,144,3072,1536,6,0,7.0264,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,193.41,828.95,0.0 +256,144,4096,512,7,0,4.6896,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,128.79,714.46,0.0 +256,144,7168,2048,10,0,11.634,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,363.41,1464.62,0.0209 +256,144,4608,7168,17,0,22.9973,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,413.64,1538.85,0.0 +256,144,7168,2304,12,0,13.2409,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,359.22,1428.24,0.0 +256,144,128,7168,7,0,12.6892,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,20.82,156.56,0.0 +256,144,2112,7168,7,0,18.1102,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,240.75,926.51,0.0 +256,144,2240,7168,7,0,18.2234,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,253.75,973.12,0.0 +256,144,8192,1536,10,0,10.431,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,347.41,1453.69,0.0206 +256,144,11264,1536,11,0,11.9976,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,415.32,1730.91,0.0 +256,160,3072,1536,6,0,6.5502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,230.52,907.97,0.0 +256,160,4096,512,7,0,4.7362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,141.69,736.83,0.0 +256,160,7168,2048,10,0,11.7219,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,400.76,1476.0,0.0213 +256,160,4608,7168,17,0,22.9632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,460.29,1552.55,0.0 +256,160,7168,2304,10,0,13.4514,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,392.88,1425.69,0.0233 +256,160,128,7168,7,0,13.1726,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,22.29,159.83,0.0 +256,160,2112,7168,12,0,18.7552,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,258.3,904.36,0.0 +256,160,2240,7168,12,0,18.8543,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,272.51,950.45,0.0 +256,160,8192,1536,10,0,10.4498,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,385.32,1478.51,0.0225 +256,160,11264,1536,9,0,12.1962,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,453.95,1734.29,0.0 +256,176,3072,1536,12,0,7.0386,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,235.98,862.43,0.0 +256,176,4096,512,6,0,4.7531,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,155.31,763.51,0.0 +256,176,7168,2048,10,0,12.0818,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,427.7,1453.73,0.0202 +256,176,4608,7168,17,0,23.1037,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,503.24,1554.46,0.0 +256,176,7168,2304,16,0,13.7576,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.55,1413.31,0.0 +256,176,128,7168,7,0,12.1177,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,26.65,183.54,0.0 +256,176,2112,7168,7,0,20.0731,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,265.47,854.07,0.0 +256,176,2240,7168,7,0,20.1259,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,280.82,899.66,0.0 +256,176,8192,1536,10,0,10.6662,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,415.25,1475.39,0.0185 +256,176,11264,1536,15,0,13.111,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,464.51,1642.65,0.0079 +256,192,3072,1536,11,0,7.7182,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.76,802.41,0.0 +256,192,4096,512,11,0,4.8046,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,167.61,784.32,0.0 +256,192,7168,2048,10,0,12.2567,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,459.92,1454.37,0.02 +256,192,4608,7168,17,0,22.2651,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,569.66,1624.78,0.0 +256,192,7168,2304,16,0,13.6604,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,464.25,1442.85,0.0 +256,192,128,7168,7,0,13.9974,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,25.17,167.38,0.0 +256,192,2112,7168,12,0,18.8487,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,308.42,919.22,0.0 +256,192,2240,7168,12,0,18.7135,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,329.47,977.52,0.0 +256,192,8192,1536,12,0,10.6821,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,452.33,1500.04,0.0 +256,192,11264,1536,15,0,13.1393,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,505.64,1668.41,0.0091 +256,208,3072,1536,12,0,7.9348,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,247.38,795.99,0.0 +256,208,4096,512,6,0,5.0959,a8w8_blockscale_bpreshuffle_1x128x128_256x16x128x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_8_1x2_intrawave_v1,171.2,766.81,0.0 +256,208,7168,2048,17,0,12.6319,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,483.45,1431.93,0.0 +256,208,4608,7168,11,0,23.4141,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,586.85,1556.24,0.0 +256,208,7168,2304,16,0,13.6088,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,504.84,1467.89,0.0 +256,208,128,7168,7,0,15.397,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,24.79,159.88,0.0 +256,208,2112,7168,7,0,20.2395,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,311.16,865.06,0.0 +256,208,2240,7168,7,0,20.4561,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,326.53,903.35,0.0 +256,208,8192,1536,11,0,11.4103,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,458.75,1429.43,0.0 +256,208,11264,1536,15,0,13.9288,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,516.73,1601.49,0.0103 +256,224,3072,1536,11,0,7.7348,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,273.3,832.46,0.0 +256,224,4096,512,11,0,5.1647,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,181.91,783.56,0.0 +256,224,7168,2048,17,0,12.6958,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,518.02,1445.37,0.0 +256,224,4608,7168,11,0,23.2846,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,635.51,1576.16,0.0 +256,224,7168,2304,16,0,13.7205,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,539.25,1475.34,0.0 +256,224,128,7168,12,0,15.2978,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,26.87,168.68,0.0 +256,224,2112,7168,12,0,18.7236,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,362.23,944.83,0.0 +256,224,2240,7168,12,0,18.6354,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.0,1001.61,0.0 +256,224,8192,1536,17,0,11.0501,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,510.14,1501.98,0.0 +256,224,11264,1536,15,0,14.1353,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,548.35,1605.33,0.0097 +256,240,3072,1536,11,0,7.7708,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,291.47,844.42,0.0 +256,240,4096,512,17,0,5.029,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,200.17,832.39,0.0 +256,240,7168,2048,17,0,12.8612,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,547.88,1447.16,0.0 +256,240,4608,7168,17,0,33.305,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.04,1109.81,0.0 +256,240,7168,2304,16,0,13.7347,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,577.17,1493.2,0.0 +256,240,128,7168,7,0,15.3666,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,28.66,175.66,0.0 +256,240,2112,7168,7,0,20.6831,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,351.33,864.13,0.0 +256,240,2240,7168,17,0,22.4435,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,343.4,839.97,0.0 +256,240,8192,1536,17,0,11.2924,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,534.86,1495.14,0.0 +256,240,11264,1536,15,0,14.3567,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,578.46,1607.39,0.0092 +256,256,3072,1536,12,0,7.5085,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,321.76,890.28,0.0 +256,256,4096,512,11,0,5.1542,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.32,839.19,0.0 +256,256,7168,2048,17,0,12.8266,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,585.98,1471.5,0.0 +256,256,4608,7168,17,0,33.8093,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,500.2,1101.01,0.0 +256,256,7168,2304,16,0,13.6939,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,617.48,1517.09,0.0 +256,256,128,7168,7,0,15.541,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,30.23,181.33,0.0 +256,256,2112,7168,12,0,19.701,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,393.44,916.46,0.0 +256,256,2240,7168,17,0,21.7267,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,378.37,876.26,0.0 +256,256,8192,1536,17,0,11.2226,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,574.06,1529.99,0.0 +256,256,11264,1536,15,0,14.5866,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,607.3,1608.45,0.0112 +256,288,3072,1536,17,0,7.9157,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,343.36,875.53,0.0 +256,288,4096,512,11,0,5.7842,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,208.84,795.94,0.0 +256,288,7168,2048,11,0,14.8507,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,569.38,1306.25,0.0 +256,288,4608,7168,10,0,33.5934,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,566.34,1123.7,0.0183 +256,288,7168,2304,9,0,16.3878,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,580.47,1300.2,0.0 +256,288,128,7168,7,0,16.5604,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,31.91,184.51,0.0 +256,288,2112,7168,12,0,20.0581,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,434.73,918.32,0.0 +256,288,2240,7168,12,0,20.1319,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,459.39,964.19,0.0 +256,288,8192,1536,15,0,13.8741,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,522.39,1278.92,0.01 +256,288,11264,1536,14,0,15.1539,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,657.63,1599.06,0.0 +256,320,3072,1536,11,0,8.2841,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,364.54,866.26,0.0 +256,320,4096,512,12,0,5.7154,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,234.84,854.26,0.0 +256,320,7168,2048,15,0,15.7397,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,596.91,1265.78,0.012 +256,320,4608,7168,17,0,33.512,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,630.8,1142.07,0.0 +256,320,7168,2304,14,0,17.52,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,603.29,1246.57,0.0 +256,320,128,7168,7,0,16.2403,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,36.16,202.78,0.0 +256,320,2112,7168,17,0,21.6995,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,446.5,865.65,0.0 +256,320,2240,7168,17,0,21.9488,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,468.18,901.36,0.0 +256,320,8192,1536,11,0,13.9271,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,578.23,1315.23,0.0 +256,320,11264,1536,14,0,15.4432,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,717.01,1618.96,0.0 +256,352,3072,1536,12,0,9.2019,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,361.0,806.57,0.0 +256,352,4096,512,12,0,6.0786,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,242.88,849.04,0.0 +256,352,7168,2048,15,0,16.5946,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,622.78,1232.16,0.0111 +256,352,4608,7168,17,0,34.0809,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,682.29,1138.39,0.0 +256,352,7168,2304,15,0,18.4343,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,630.71,1213.63,0.0105 +256,352,128,7168,7,0,16.9917,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,38.01,207.79,0.0 +256,352,2112,7168,17,0,22.4901,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,473.89,851.43,0.0 +256,352,2240,7168,17,0,22.6341,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,499.41,890.53,0.0 +256,352,8192,1536,15,0,14.5377,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,609.34,1299.43,0.0108 +256,352,11264,1536,8,0,17.7817,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,684.99,1449.36,0.0 +256,384,3072,1536,17,0,10.0823,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,359.43,760.51,0.0 +256,384,4096,512,11,0,6.0382,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,266.74,900.85,0.0 +256,384,7168,2048,14,0,16.6003,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,679.16,1263.32,0.0001 +256,384,4608,7168,17,0,33.7358,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,751.94,1165.57,0.0 +256,384,7168,2304,14,0,18.8133,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,674.18,1217.48,0.0 +256,384,128,7168,7,0,16.8568,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,41.8,223.55,0.0 +256,384,2112,7168,17,0,21.6425,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,537.21,901.62,0.0 +256,384,2240,7168,17,0,21.7408,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,567.19,944.27,0.0 +256,384,8192,1536,11,0,14.6209,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,660.95,1331.26,0.0 +256,384,11264,1536,14,0,18.7788,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,707.58,1413.41,0.0 +256,416,3072,1536,10,0,10.3186,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,380.47,766.91,0.0201 +256,416,4096,512,11,0,6.3357,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,275.4,902.51,0.0 +256,416,7168,2048,14,0,16.8772,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,723.69,1273.66,0.0001 +256,416,4608,7168,17,0,35.3235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,777.98,1128.03,0.0 +256,416,7168,2304,14,0,19.3946,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,708.47,1208.45,0.0001 +256,416,128,7168,7,0,16.8231,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,45.38,238.12,0.0 +256,416,2112,7168,17,0,22.607,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,557.15,879.28,0.0 +256,416,2240,7168,17,0,22.6496,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,589.81,922.84,0.0 +256,416,8192,1536,14,0,15.0697,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,694.7,1329.66,0.0 +256,416,11264,1536,14,0,19.6139,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,733.91,1392.49,0.0001 +256,448,3072,1536,10,0,10.521,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,401.85,775.52,0.0226 +256,448,4096,512,14,0,6.4638,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,290.7,927.71,0.0 +256,448,7168,2048,14,0,16.8311,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,781.49,1308.3,0.0001 +256,448,4608,7168,17,0,34.7379,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,851.95,1162.14,0.0 +256,448,7168,2304,14,0,19.1245,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,773.75,1253.36,0.0 +256,448,128,7168,7,0,17.2395,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,47.69,246.15,0.0 +256,448,2112,7168,17,0,21.83,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,621.36,927.28,0.0 +256,448,2240,7168,17,0,21.9689,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,654.86,968.4,0.0 +256,448,8192,1536,14,0,15.3434,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,734.8,1343.32,0.0 +256,448,11264,1536,14,0,19.5871,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.45,1433.71,0.0001 +256,480,3072,1536,12,0,10.737,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,421.89,782.81,0.0 +256,480,4096,512,2,0,6.2848,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,320.34,998.45,0.0116 +256,480,7168,2048,14,0,17.2931,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,814.94,1303.66,0.0001 +256,480,4608,7168,14,0,44.6566,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,710.06,915.75,0.0 +256,480,7168,2304,14,0,19.3619,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,818.85,1265.49,0.0 +256,480,128,7168,7,0,17.124,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,51.44,261.68,0.0 +256,480,2112,7168,12,0,24.3686,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,596.39,845.64,0.0 +256,480,2240,7168,10,0,30.5163,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,505.11,709.37,0.0203 +256,480,8192,1536,14,0,15.3417,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,787.37,1380.85,0.0 +256,480,11264,1536,14,0,20.0691,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,827.61,1437.64,0.0002 +256,512,3072,1536,12,0,10.3254,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,467.96,837.81,0.0 +256,512,4096,512,16,0,6.3359,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,338.94,1034.36,0.0 +256,512,7168,2048,14,0,17.4512,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,861.4,1321.9,0.0001 +256,512,4608,7168,14,0,45.4319,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,744.47,911.67,0.0 +256,512,7168,2304,14,0,19.7543,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,856.09,1267.31,0.0 +256,512,128,7168,7,0,17.2667,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,54.41,273.28,0.0 +256,512,2112,7168,17,0,28.6404,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,541.27,732.24,0.0 +256,512,2240,7168,17,0,32.0107,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,513.63,687.9,0.0 +256,512,8192,1536,14,0,15.5365,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,829.33,1400.44,0.0 +256,512,11264,1536,14,0,20.3545,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,870.41,1455.32,0.0002 +256,544,3072,1536,11,0,10.7665,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,476.83,826.31,0.0 +256,544,4096,512,11,0,6.5542,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,348.13,1042.4,0.0 +256,544,7168,2048,14,0,17.8795,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,893.31,1319.55,0.0001 +256,544,4608,7168,14,0,43.6922,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,822.5,959.97,0.0001 +256,544,7168,2304,14,0,20.4991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,876.55,1247.24,0.0 +256,544,128,7168,7,0,17.6502,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,56.56,280.8,0.0 +256,544,2112,7168,17,0,30.1928,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,545.53,706.66,0.0 +256,544,2240,7168,17,0,30.4538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,573.63,735.31,0.0 +256,544,8192,1536,14,0,19.5581,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,699.98,1141.8,0.0001 +256,544,11264,1536,13,0,22.6618,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,830.65,1341.13,0.0 +256,576,3072,1536,11,0,10.7765,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,504.41,848.35,0.0 +256,576,4096,512,11,0,6.4566,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,374.18,1101.3,0.0 +256,576,7168,2048,14,0,18.2205,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,928.15,1323.63,0.0001 +256,576,4608,7168,14,0,41.5108,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,916.65,1023.04,0.0 +256,576,7168,2304,14,0,20.3428,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,935.24,1283.0,0.0 +256,576,128,7168,7,0,17.3162,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,61.04,299.93,0.0 +256,576,2112,7168,17,0,29.3292,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,594.63,739.9,0.0 +256,576,2240,7168,17,0,29.4991,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,627.03,771.74,0.0 +256,576,8192,1536,14,0,19.5228,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,742.49,1173.23,0.0001 +256,576,11264,1536,13,0,22.6297,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,880.76,1377.06,0.0 +256,608,3072,1536,17,0,11.1319,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,515.44,843.34,0.0 +256,608,4096,512,11,0,6.5322,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,390.39,1131.19,0.0 +256,608,7168,2048,14,0,22.1181,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,807.07,1114.09,0.0001 +256,608,4608,7168,14,0,46.0337,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,872.51,933.92,0.0001 +256,608,7168,2304,14,0,24.7544,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,811.26,1075.86,0.0001 +256,608,128,7168,7,0,17.5586,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,63.54,309.32,0.0 +256,608,2112,7168,12,0,31.2029,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,589.97,707.15,0.0 +256,608,2240,7168,12,0,31.8443,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,613.12,726.61,0.0 +256,608,8192,1536,14,0,19.506,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,784.42,1203.64,0.0001 +256,608,11264,1536,13,0,23.1995,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,906.86,1376.43,0.0 +256,640,3072,1536,11,0,11.0309,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,547.53,873.35,0.0 +256,640,4096,512,12,0,6.9388,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,386.86,1105.05,0.0 +256,640,7168,2048,14,0,22.1029,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,850.14,1138.58,0.0002 +256,640,4608,7168,14,0,45.0184,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,939.14,966.62,0.0001 +256,640,7168,2304,14,0,24.8262,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,851.49,1094.19,0.0001 +256,640,128,7168,7,0,17.2882,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,67.93,327.9,0.0 +256,640,2112,7168,12,0,31.333,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,618.44,715.85,0.0 +256,640,2240,7168,17,0,32.1596,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,639.07,731.07,0.0 +256,640,8192,1536,14,0,19.6599,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,819.24,1223.39,0.0001 +256,640,11264,1536,13,0,23.483,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,943.06,1392.6,0.0 +256,672,3072,1536,11,0,11.1656,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,567.98,884.82,0.0 +256,672,4096,512,9,0,7.2846,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,386.92,1090.83,0.0 +256,672,7168,2048,14,0,22.8363,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,863.98,1124.97,0.0001 +256,672,4608,7168,14,0,46.7375,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.83,942.29,0.0001 +256,672,7168,2304,14,0,25.263,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,878.61,1096.35,0.0001 +256,672,128,7168,7,0,17.4629,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,70.61,338.23,0.0 +256,672,2112,7168,17,0,33.2237,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,612.41,686.08,0.0 +256,672,2240,7168,17,0,32.7518,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,658.89,729.24,0.0 +256,672,8192,1536,14,0,19.8966,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,849.97,1237.66,0.0001 +256,672,11264,1536,13,0,24.3387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,955.4,1375.28,0.0 +256,704,3072,1536,15,0,12.7235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,522.17,795.8,0.0103 +256,704,4096,512,9,0,7.2722,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,406.04,1130.99,0.0 +256,704,7168,2048,14,0,22.4082,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,922.41,1169.86,0.0001 +256,704,4608,7168,14,0,46.4409,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1001.41,959.6,0.0001 +256,704,7168,2304,14,0,25.0759,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.31,1125.77,0.0001 +256,704,128,7168,7,0,17.4112,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,74.2,352.88,0.0 +256,704,2112,7168,17,0,32.2596,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,660.75,717.89,0.0 +256,704,2240,7168,15,0,33.2265,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,680.4,730.04,0.0088 +256,704,8192,1536,14,0,20.0629,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,883.06,1255.98,0.0002 +256,704,11264,1536,13,0,23.9522,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1017.05,1429.62,0.0 +256,736,3072,1536,15,0,13.0679,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,531.51,793.63,0.0098 +256,736,4096,512,15,0,8.1013,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,381.05,1049.62,0.0101 +256,736,7168,2048,14,0,22.9657,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,940.93,1164.29,0.0002 +256,736,4608,7168,14,0,47.6525,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1020.31,946.2,0.0001 +256,736,7168,2304,14,0,26.3794,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,921.56,1090.32,0.0002 +256,736,128,7168,7,0,17.2879,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,78.12,369.13,0.0 +256,736,2112,7168,17,0,32.3747,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,688.33,726.6,0.0 +256,736,2240,7168,15,0,33.4748,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,706.05,735.75,0.0092 +256,736,8192,1536,14,0,20.6675,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,896.19,1246.98,0.0002 +256,736,11264,1536,14,0,28.3815,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,897.34,1233.64,0.0003 +256,768,3072,1536,15,0,12.8704,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,563.13,824.9,0.0109 +256,768,4096,512,9,0,7.1138,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,452.81,1234.48,0.0 +256,768,7168,2048,14,0,23.4702,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,960.73,1161.6,0.0003 +256,768,4608,7168,14,0,47.348,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1071.52,963.36,0.0001 +256,768,7168,2304,14,0,25.6188,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,990.18,1143.48,0.0002 +256,768,128,7168,7,0,17.4362,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,80.83,379.62,0.0 +256,768,2112,7168,17,0,31.4984,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,738.24,758.38,0.0 +256,768,2240,7168,17,0,32.8377,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,751.04,761.38,0.0 +256,768,8192,1536,14,0,20.6334,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,936.7,1276.84,0.0002 +256,768,11264,1536,14,0,27.8839,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,953.06,1283.27,0.0002 +256,800,3072,1536,9,0,13.5065,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,558.97,804.25,0.0002 +256,800,4096,512,11,0,8.6137,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,389.55,1051.85,0.0 +256,800,7168,2048,14,0,24.0675,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,975.93,1154.56,0.0003 +256,800,4608,7168,14,0,49.0037,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1078.45,941.51,0.0001 +256,800,7168,2304,14,0,27.0976,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,975.15,1100.73,0.0002 +256,800,128,7168,7,0,17.2924,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,84.89,396.52,0.0 +256,800,2112,7168,17,0,34.0854,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,710.63,711.52,0.0 +256,800,2240,7168,17,0,34.2121,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,750.91,741.69,0.0 +256,800,8192,1536,13,0,22.6597,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,888.48,1187.96,0.0 +256,800,11264,1536,14,0,29.4759,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,939.15,1240.09,0.0002 +256,832,3072,1536,11,0,13.5812,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,578.13,817.92,0.0 +256,832,4096,512,11,0,8.7297,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,399.75,1069.78,0.0 +256,832,7168,2048,14,0,24.3455,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1003.37,1162.91,0.0002 +256,832,4608,7168,13,0,48.8678,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1124.71,954.85,0.0 +256,832,7168,2304,14,0,26.6808,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1029.99,1137.88,0.0002 +256,832,128,7168,7,0,17.4686,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,87.4,406.12,0.0 +256,832,2112,7168,17,0,33.4405,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,753.31,736.14,0.0 +256,832,2240,7168,17,0,33.183,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,805.16,775.92,0.0 +256,832,8192,1536,13,0,22.6764,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,923.34,1212.38,0.0 +256,832,11264,1536,14,0,29.5287,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,974.97,1263.95,0.0002 +256,864,3072,1536,11,0,13.7838,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,591.54,823.73,0.0 +256,864,4096,512,14,0,8.6765,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,417.67,1108.44,0.0 +256,864,7168,2048,13,0,25.9384,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,977.98,1111.7,0.0 +256,864,4608,7168,14,0,50.4218,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1131.97,935.82,0.0001 +256,864,7168,2304,13,0,28.4282,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1003.86,1086.67,0.0 +256,864,128,7168,7,0,17.4738,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,90.73,419.59,0.0 +256,864,2112,7168,17,0,34.0582,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,768.09,733.49,0.0 +256,864,2240,7168,17,0,34.2926,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,809.08,761.69,0.0 +256,864,8192,1536,13,0,22.8305,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,952.38,1229.31,0.0 +256,864,11264,1536,14,0,30.0839,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,993.79,1266.22,0.0002 +256,896,3072,1536,11,0,13.7912,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,613.12,841.11,0.0 +256,896,4096,512,16,0,8.9001,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,422.25,1111.89,0.0 +256,896,7168,2048,13,0,26.0642,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1009.3,1126.45,0.0 +256,896,4608,7168,13,0,49.6474,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1192.21,960.98,0.0 +256,896,7168,2304,13,0,28.4018,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1042.01,1106.43,0.0 +256,896,128,7168,7,0,17.4262,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,94.35,434.37,0.0 +256,896,2112,7168,17,0,33.6303,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,806.68,753.67,0.0 +256,896,2240,7168,15,0,34.8031,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,826.73,761.22,0.0097 +256,896,8192,1536,13,0,23.2974,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,967.86,1229.29,0.0 +256,896,11264,1536,14,0,29.8151,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1039.89,1303.46,0.0002 +256,928,3072,1536,11,0,14.1072,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,620.8,839.69,0.0 +256,928,4096,512,9,0,9.2641,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,420.15,1098.27,0.0001 +256,928,7168,2048,13,0,26.8127,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1016.17,1114.56,0.0 +256,928,4608,7168,14,0,62.6104,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,979.13,770.39,0.0001 +256,928,7168,2304,13,0,29.6374,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1034.23,1078.27,0.0 +256,928,128,7168,7,0,17.5367,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,97.1,445.18,0.0 +256,928,2112,7168,15,0,35.4255,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,793.15,725.77,0.0094 +256,928,2240,7168,15,0,41.3108,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,721.37,650.33,0.0103 +256,928,8192,1536,13,0,24.0023,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,972.99,1217.08,0.0 +256,928,11264,1536,13,0,32.1391,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,999.14,1233.17,0.0 +256,960,3072,1536,15,0,14.2159,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,637.29,850.55,0.011 +256,960,4096,512,14,0,9.4689,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,425.24,1103.93,0.0 +256,960,7168,2048,13,0,26.7651,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1053.08,1136.13,0.0 +256,960,4608,7168,13,0,61.3396,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.88,794.9,0.0 +256,960,7168,2304,13,0,29.9557,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1058.53,1084.58,0.0 +256,960,128,7168,7,0,17.7166,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,99.43,454.07,0.0 +256,960,2112,7168,15,0,35.0171,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,830.07,744.64,0.0089 +256,960,2240,7168,15,0,41.2822,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,746.77,659.81,0.0083 +256,960,8192,1536,13,0,24.1159,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1001.8,1235.12,0.0 +256,960,11264,1536,13,0,32.1454,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1033.39,1256.88,0.0 +256,992,3072,1536,15,0,14.3692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,651.51,858.58,0.0107 +256,992,4096,512,16,0,9.0103,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,461.78,1191.03,0.0 +256,992,7168,2048,13,0,28.164,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1034.13,1098.32,0.0 +256,992,4608,7168,14,0,65.4042,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1001.95,753.52,0.0002 +256,992,7168,2304,13,0,31.124,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1052.75,1060.98,0.0 +256,992,128,7168,7,0,17.6278,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,103.26,469.83,0.0 +256,992,2112,7168,15,0,42.7996,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,701.77,617.76,0.011 +256,992,2240,7168,15,0,44.0329,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,723.45,627.06,0.0095 +256,992,8192,1536,13,0,24.3949,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1023.35,1244.5,0.0 +256,992,11264,1536,13,0,35.8936,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,956.33,1147.08,0.0 +256,1024,3072,1536,15,0,14.4314,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,669.63,871.91,0.0104 +256,1024,4096,512,14,0,8.9977,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,477.34,1223.65,0.0 +256,1024,7168,2048,13,0,27.6361,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1087.88,1138.27,0.0 +256,1024,4608,7168,14,0,66.6804,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1014.48,746.96,0.0001 +256,1024,7168,2304,13,0,30.5214,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1108.17,1099.37,0.0 +256,1024,128,7168,7,0,17.738,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,105.93,480.31,0.0 +256,1024,2112,7168,15,0,42.9525,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,721.83,624.04,0.0093 +256,1024,2240,7168,15,0,45.8042,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,717.91,610.95,0.0102 +256,1024,8192,1536,13,0,24.4525,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1053.87,1265.02,0.0 +256,1024,11264,1536,13,0,35.9326,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,986.11,1167.27,0.0 +256,1088,3072,1536,14,0,14.6439,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,701.16,892.82,0.0 +256,1088,4096,512,14,0,10.8902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,419.04,1062.16,0.0 +256,1088,7168,2048,13,0,27.8387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1147.46,1167.65,0.0 +256,1088,4608,7168,13,0,63.115,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1138.77,805.77,0.0 +256,1088,7168,2304,13,0,31.8055,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1129.89,1088.47,0.0 +256,1088,128,7168,7,0,17.4858,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,114.18,514.41,0.0 +256,1088,2112,7168,15,0,42.1783,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,781.02,652.78,0.0098 +256,1088,2240,7168,15,0,42.6599,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,819.0,673.45,0.0114 +256,1088,8192,1536,14,0,29.1979,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,937.75,1098.7,0.0002 +256,1088,11264,1536,13,0,34.8053,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1081.68,1249.33,0.0 +256,1152,3072,1536,14,0,14.8672,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,731.25,912.48,0.0 +256,1152,4096,512,11,0,10.4873,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,460.73,1156.08,0.0 +256,1152,7168,2048,13,0,28.4487,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1188.91,1179.47,0.0 +256,1152,4608,7168,13,0,63.9377,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1190.24,811.8,0.0 +256,1152,7168,2304,13,0,31.2094,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1219.21,1143.38,0.0 +256,1152,128,7168,7,0,17.3729,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,121.68,545.1,0.0 +256,1152,2112,7168,15,0,42.8632,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,813.75,659.36,0.0111 +256,1152,2240,7168,15,0,42.9315,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,861.69,686.55,0.0101 +256,1152,8192,1536,14,0,29.0123,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,999.27,1145.26,0.0003 +256,1152,11264,1536,14,0,38.2423,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1042.37,1177.31,0.0003 +256,1216,3072,1536,14,0,15.0215,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,763.95,935.82,0.0 +256,1216,4096,512,11,0,10.5963,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,481.33,1196.76,0.0 +256,1216,7168,2048,14,0,34.2239,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1043.19,1011.08,0.0003 +256,1216,4608,7168,13,0,64.1537,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1252.14,825.41,0.0 +256,1216,7168,2304,14,0,39.8731,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1007.31,921.66,0.0003 +256,1216,128,7168,7,0,17.5146,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,127.4,567.82,0.0 +256,1216,2112,7168,15,0,42.2623,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,871.17,685.99,0.0099 +256,1216,2240,7168,15,0,44.7578,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,872.45,675.2,0.0093 +256,1216,8192,1536,14,0,30.1479,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1015.05,1140.17,0.0003 +256,1216,11264,1536,13,0,39.6259,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1061.86,1175.07,0.0 +256,1280,3072,1536,14,0,15.2662,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,791.26,953.02,0.0 +256,1280,4096,512,9,0,10.7165,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,500.98,1235.32,0.0002 +256,1280,7168,2048,14,0,34.9063,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1076.62,1021.35,0.0002 +256,1280,4608,7168,13,0,66.3484,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1274.44,813.91,0.0 +256,1280,7168,2304,14,0,38.4207,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1100.41,984.22,0.0003 +256,1280,128,7168,7,0,17.0544,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,137.72,611.0,0.0 +256,1280,2112,7168,15,0,44.41,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,872.67,669.23,0.0105 +256,1280,2240,7168,15,0,47.0387,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,873.84,658.3,0.0113 +256,1280,8192,1536,14,0,29.7623,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1082.32,1193.47,0.0002 +256,1280,11264,1536,13,0,40.1039,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1104.43,1199.47,0.0 +256,1344,3072,1536,14,0,15.3253,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,827.62,981.42,0.0 +256,1344,4096,512,11,0,11.226,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,502.15,1228.87,0.0 +256,1344,7168,2048,14,0,35.111,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1123.86,1045.26,0.0003 +256,1344,4608,7168,14,0,69.8148,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1271.72,788.52,0.0001 +256,1344,7168,2304,14,0,40.1692,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1105.14,967.89,0.0003 +256,1344,128,7168,7,0,17.5946,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,140.17,619.24,0.0 +256,1344,2112,7168,15,0,47.8227,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,850.92,636.72,0.0133 +256,1344,2240,7168,15,0,47.6559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,905.65,665.42,0.0109 +256,1344,8192,1536,13,0,32.0707,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1054.63,1143.33,0.0 +256,1344,11264,1536,14,0,42.6947,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1089.28,1162.76,0.0003 +256,1408,3072,1536,14,0,18.4709,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,719.38,840.89,0.0 +256,1408,4096,512,11,0,11.4299,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,516.68,1255.69,0.0 +256,1408,7168,2048,14,0,35.9583,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1149.64,1049.79,0.0004 +256,1408,4608,7168,13,0,67.567,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1376.6,830.27,0.0 +256,1408,7168,2304,14,0,40.2003,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1156.87,993.63,0.0003 +256,1408,128,7168,7,0,17.0568,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,151.48,666.63,0.0 +256,1408,2112,7168,15,0,47.3223,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,900.86,658.86,0.0105 +256,1408,2240,7168,15,0,56.5775,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,799.16,573.67,0.0138 +256,1408,8192,1536,13,0,31.7962,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1114.39,1189.27,0.0 +256,1408,11264,1536,13,0,41.2863,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1180.08,1239.72,0.0 +256,1472,3072,1536,14,0,18.7869,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,739.43,852.91,0.0 +256,1472,4096,512,11,0,11.5851,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,532.93,1286.95,0.0 +256,1472,7168,2048,13,0,38.6451,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1118.33,1003.94,0.0 +256,1472,4608,7168,13,0,70.912,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1371.29,805.89,0.0 +256,1472,7168,2304,13,0,41.985,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1158.04,976.76,0.0 +256,1472,128,7168,7,0,17.045,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,158.47,694.96,0.0 +256,1472,2112,7168,15,0,48.1347,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,925.92,662.89,0.0127 +256,1472,2240,7168,15,0,56.6831,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,833.93,585.75,0.0121 +256,1472,8192,1536,13,0,32.4447,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1141.76,1200.85,0.0 +256,1472,11264,1536,13,0,41.8267,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1217.78,1260.53,0.0 +256,1536,3072,1536,14,0,18.7986,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,771.1,878.53,0.0001 +256,1536,4096,512,11,0,11.7512,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,548.24,1316.16,0.0 +256,1536,7168,2048,13,0,38.7247,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1164.56,1028.95,0.0 +256,1536,4608,7168,13,0,74.1864,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1367.75,784.46,0.0 +256,1536,7168,2304,13,0,42.7211,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1187.57,984.86,0.0 +256,1536,128,7168,7,0,17.0095,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,165.71,724.35,0.0 +256,1536,2112,7168,15,0,56.2643,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,826.57,580.06,0.0121 +256,1536,2240,7168,15,0,57.2408,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,861.71,593.07,0.0129 +256,1536,8192,1536,13,0,32.9551,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1172.95,1217.05,0.0 +256,1536,11264,1536,13,0,47.4698,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1119.66,1143.12,0.0 +256,1600,3072,1536,14,0,19.197,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,786.55,885.9,0.0001 +256,1600,4096,512,11,0,12.4198,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,540.34,1290.16,0.0 +256,1600,7168,2048,13,0,39.2775,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1196.01,1041.17,0.0 +256,1600,4608,7168,13,0,76.2076,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1386.95,777.41,0.0 +256,1600,7168,2304,13,0,43.1301,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1225.32,1000.21,0.0 +256,1600,128,7168,7,0,17.142,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,171.28,746.47,0.0 +256,1600,2112,7168,15,0,57.7957,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,838.2,577.31,0.0119 +256,1600,2240,7168,15,0,58.5542,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,877.48,592.5,0.0127 +256,1600,8192,1536,14,0,39.0007,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1032.43,1057.8,0.0003 +256,1600,11264,1536,13,0,48.686,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1137.18,1146.2,0.0 +256,1664,3072,1536,14,0,19.2705,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,814.9,908.03,0.0001 +256,1664,4096,512,11,0,12.5421,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,556.47,1322.0,0.0 +256,1664,7168,2048,13,0,40.0303,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1220.46,1047.78,0.0 +256,1664,4608,7168,13,0,78.2312,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1405.12,770.7,0.0 +256,1664,7168,2304,13,0,45.0495,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1220.04,981.23,0.0 +256,1664,128,7168,7,0,17.2341,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,177.18,770.05,0.0 +256,1664,2112,7168,15,0,58.1058,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,867.07,586.78,0.0115 +256,1664,2240,7168,15,0,58.5507,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,912.64,605.26,0.0125 +256,1664,8192,1536,14,0,39.0559,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1072.2,1085.67,0.0003 +256,1664,11264,1536,13,0,48.9251,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1176.89,1172.08,0.0 +256,1728,3072,1536,14,0,19.5203,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,835.41,921.59,0.0001 +256,1728,4096,512,11,0,12.664,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,572.31,1353.26,0.0 +256,1728,7168,2048,13,0,40.2419,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1260.73,1068.33,0.0 +256,1728,4608,7168,13,0,77.4316,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1474.23,792.2,0.0 +256,1728,7168,2304,13,0,45.5247,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1253.74,994.38,0.0 +256,1728,128,7168,7,0,17.224,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,184.1,798.08,0.0 +256,1728,2112,7168,15,0,59.0234,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,886.42,590.01,0.0103 +256,1728,2240,7168,15,0,60.5467,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,916.49,597.62,0.0128 +256,1728,8192,1536,14,0,39.1791,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1109.94,1111.53,0.0003 +256,1728,11264,1536,13,0,50.8631,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1175.59,1157.7,0.0 +256,1792,3072,1536,14,0,19.6235,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,861.79,941.79,0.0002 +256,1792,4096,512,11,0,12.8007,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,587.17,1382.32,0.0 +256,1792,7168,2048,13,0,45.9911,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1143.99,957.58,0.0 +256,1792,4608,7168,13,0,80.042,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1478.97,779.47,0.0 +256,1792,7168,2304,13,0,51.3255,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1153.23,902.75,0.0 +256,1792,128,7168,7,0,17.193,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,191.26,827.16,0.0 +256,1792,2112,7168,15,0,58.5148,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,927.24,607.59,0.0137 +256,1792,2240,7168,15,0,60.4126,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,952.55,611.29,0.0131 +256,1792,8192,1536,13,0,39.9475,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1128.91,1118.86,0.0 +256,1792,11264,1536,13,0,50.5023,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1227.84,1196.46,0.0 +256,1856,3072,1536,14,0,19.7339,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,887.58,961.43,0.0001 +256,1856,4096,512,11,0,13.4968,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,576.78,1352.3,0.0 +256,1856,7168,2048,14,0,46.4731,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1172.56,970.21,0.0003 +256,1856,4608,7168,14,0,101.1065,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1212.66,627.45,0.0002 +256,1856,7168,2304,14,0,50.7571,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1207.79,933.84,0.0003 +256,1856,128,7168,7,0,17.219,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,197.79,853.5,0.0 +256,1856,2112,7168,15,0,61.1264,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,919.33,593.56,0.0119 +256,1856,2240,7168,15,0,63.2116,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,942.88,596.01,0.0142 +256,1856,8192,1536,13,0,41.1411,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1135.31,1114.27,0.0 +256,1856,11264,1536,13,0,53.7919,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1193.92,1151.93,0.0 +256,1920,3072,1536,14,0,20.0902,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,901.9,968.84,0.0002 +256,1920,4096,512,11,0,13.8011,a8w8_blockscale_bpreshuffle_1x128x128_256x32x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,583.51,1362.85,0.0 +256,1920,7168,2048,13,0,47.0238,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1198.79,981.15,0.0 +256,1920,4608,7168,14,0,101.8229,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1245.65,633.33,0.0002 +256,1920,7168,2304,13,0,52.3907,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1210.48,925.05,0.0 +256,1920,128,7168,7,0,17.3114,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,203.52,876.39,0.0 +256,1920,2112,7168,17,0,63.4512,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,916.19,583.31,0.0 +256,1920,2240,7168,15,0,71.8732,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,857.85,534.56,0.0127 +256,1920,8192,1536,13,0,42.5047,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1136.78,1105.51,0.0 +256,1920,11264,1536,13,0,58.89,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1128.17,1078.36,0.0 +256,1984,3072,1536,14,0,20.4447,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,915.81,976.08,0.0002 +256,1984,4096,512,16,0,14.1538,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,587.93,1368.25,0.0 +256,1984,7168,2048,13,0,50.9014,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1144.38,927.01,0.0 +256,1984,4608,7168,14,0,101.9497,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1285.57,642.83,0.0002 +256,1984,7168,2304,13,0,54.3635,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1205.44,911.07,0.0 +256,1984,128,7168,7,0,17.2452,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,211.11,907.31,0.0 +256,1984,2112,7168,15,0,64.72,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,928.16,583.14,0.0122 +256,1984,2240,7168,15,0,72.6116,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,877.43,539.39,0.0129 +256,1984,8192,1536,13,0,43.673,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1143.25,1102.2,0.0 +256,1984,11264,1536,13,0,59.8064,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1147.91,1087.58,0.0 +256,2048,3072,1536,14,0,20.3481,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,949.84,1004.87,0.0002 +256,2048,4096,512,8,0,14.727,a8w8_blockscale_bpreshuffle_1x128x128_256x32x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,583.28,1352.82,0.0 +256,2048,7168,2048,13,0,51.4088,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1169.64,938.25,0.0 +256,2048,4608,7168,14,0,107.6462,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1256.82,618.55,0.0002 +256,2048,7168,2304,13,0,56.3484,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1200.49,897.87,0.0 +256,2048,128,7168,7,0,17.1571,a8w8_blockscale_bpreshuffle_1x128x128_256x16x64x256_16x16_16x16_16x16x1_16x16x1_1x16x1x16_4_1x1_intrawave_v1,219.04,939.66,0.0 +256,2048,2112,7168,17,0,75.8695,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,817.31,507.05,0.0 +256,2048,2240,7168,15,0,77.8746,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,844.52,512.51,0.0132 +256,2048,8192,1536,13,0,45.7842,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1125.71,1076.42,0.0 +256,2048,11264,1536,13,0,63.5576,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1115.0,1047.63,0.0 +256,4096,3072,1536,13,0,33.098,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1167.89,1092.99,0.0 +256,4096,4096,512,16,0,24.1906,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,710.19,1560.47,0.0 +256,4096,7168,2048,13,0,88.0582,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1365.68,928.81,0.0 +256,4096,4608,7168,13,0,172.9439,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1564.57,579.03,0.0 +256,4096,7168,2304,13,0,96.2657,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1405.4,879.57,0.0 +256,4096,128,7168,12,0,18.8135,a8w8_blockscale_bpreshuffle_1x128x128_256x32x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,399.51,1665.09,0.0 +256,4096,2112,7168,15,0,124.7209,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,994.36,495.51,0.0134 +256,4096,2240,7168,15,0,125.7021,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1046.39,507.28,0.0132 +256,4096,8192,1536,13,0,81.534,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1264.25,1054.57,0.0 +256,4096,11264,1536,13,0,107.9513,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1312.94,1073.33,0.0 +256,8192,3072,1536,13,0,61.3912,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1259.29,1101.68,0.0 +256,8192,4096,512,16,0,42.9003,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,800.92,1710.95,0.0 +256,8192,7168,2048,13,0,157.6682,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1525.47,944.37,0.0 +256,8192,4608,7168,13,0,303.8251,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1781.18,550.47,0.0 +256,8192,7168,2304,13,0,173.3037,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1561.32,881.86,0.0 +256,8192,128,7168,17,0,23.2457,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,646.67,2655.76,0.0 +256,8192,2112,7168,15,0,224.9827,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1102.46,482.09,0.0144 +256,8192,2240,7168,15,0,232.1474,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1133.19,480.2,0.0144 +256,8192,8192,1536,13,0,147.5306,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1397.39,1080.34,0.0 +256,8192,11264,1536,13,0,193.6738,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1463.64,1107.19,0.0 +256,16384,3072,1536,13,0,112.7521,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1371.32,1157.83,0.0 +256,16384,4096,512,16,0,79.5094,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,864.29,1819.95,0.0 +256,16384,7168,2048,13,0,295.6815,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1626.87,957.5,0.0 +256,16384,4608,7168,13,0,576.7307,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1876.67,522.71,0.0 +256,16384,7168,2304,13,0,324.2619,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1668.92,891.7,0.0 +256,16384,128,7168,14,0,39.7935,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,755.52,3079.71,0.0 +256,16384,2112,7168,15,0,419.8442,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1181.55,480.62,0.014 +256,16384,2240,7168,15,0,437.1202,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1203.64,473.32,0.0149 +256,16384,8192,1536,13,0,275.8353,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1494.79,1110.03,0.0 +256,16384,11264,1536,13,0,356.6236,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1589.73,1154.06,0.0 +256,32768,3072,1536,13,0,214.7832,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1439.77,1193.65,0.0 +256,32768,4096,512,16,0,147.5846,a8w8_blockscale_bpreshuffle_1x128x128_256x64x128x256_16x16_16x16_16x16x1_16x16x1_1x32x1x8_8_2x1_intrawave_v1,931.26,1946.75,0.0 +256,32768,7168,2048,13,0,569.6706,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1688.82,968.19,0.0 +256,32768,4608,7168,13,0,1101.5999,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1965.02,517.34,0.0 +256,32768,7168,2304,13,0,619.81,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1746.23,906.37,0.0 +256,32768,128,7168,0,0,60.8852,a8w8_blockscale_bpreshuffle_1x128x128_256x128x128x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,987.59,4010.62,0.0 +256,32768,2112,7168,2,0,807.0066,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v3,1229.4,481.32,0.0044 +256,32768,2240,7168,15,0,846.6061,a8w8_blockscale_bpreshuffle_1x128x128_256x64x64x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1242.92,469.8,0.0138 +256,32768,8192,1536,13,0,525.8909,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1568.07,1140.51,0.0 +256,32768,11264,1536,13,0,688.0934,a8w8_blockscale_bpreshuffle_1x128x128_256x64x256x128_16x16_16x16_8x32x1_8x32x1_1x32x1x8_8_2x1_intrawave_v1,1647.85,1171.11,0.0