diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 3aca1cba5c..21b044281a 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -265,6 +265,9 @@ CNTX_INIT_PROTS( generic ) // -- ARM architectures -- +#ifdef BLIS_KERNELS_ARMSVE +#include "bli_kernels_armsve.h" +#endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" #endif diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c new file mode 100644 index 0000000000..82def6df7b --- /dev/null +++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c @@ -0,0 +1,235 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#else +#error "No Arm SVE intrinsics support in compiler" +#endif // __ARM_FEATURE_SVE + +// assumption: +// SVE vector length = 256 bits. +// + +void bli_dpackm_armsve256_asm_8xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + void* restrict kappa_, + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + double* a = ( double* )a_; + double* p = ( double* )p_; + double* kappa = ( double* )kappa_; + const int64_t cdim = cdim_; + const int64_t mnr = 8; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict alpha1 = a; + double* restrict alpha1_4 = alpha1 + 4 * inca; + double* restrict pi1 = p; + const svbool_t all_active = svptrue_b64(); + svfloat64_t z_a0; + svfloat64_t z_a4; + svuint64_t z_index; + + // Create the index vector for gather/scatter accesses: element i holds + // the byte offset i * inca * sizeof( double ). + z_index = svindex_u64( 0, inca * sizeof( double ) ); + + if ( cdim == mnr ) + { + if ( bli_deq1( *kappa ) ) + { + if ( inca == 1 ) // contiguous memory.
packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // svld1_f64 returns all zeros in z_a0 and z_a4 here, which is + // incorrect; this appears to be a problem with how qemu-aarch64 + // or gcc handles svld1_f64. + + // load 8 contiguous elements from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + + // as a workaround, use a gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } + else // *kappa != 1.0 + { + // load kappa into vector + svfloat64_t z_kappa; + + z_kappa = svdup_f64( *kappa ); + + if ( inca == 1 ) // contiguous memory. packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // load 8 contiguous elements from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + // same reason as above; as a workaround, use a gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store.
packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } // end of if ( *kappa == 1.0 ) + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c new file mode 100644 index 0000000000..01bb644b12 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c @@ -0,0 +1,809 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" + +/* + o 8x8 Double precision micro-kernel + o Runnable on ARMv8a with SVE 256 feature, compiled with aarch64 GCC. + o Tested on qemu-aarch64 and armie for SVE. + + Preconditions: + - to use this kernel, SVE with vector length of 256 bits is a must. + + April 2020. 
+*/ +void bli_dgemm_armsve256_asm_8x8 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +__asm__ volatile +( +" \n\t" +" ldr x0,%[aaddr] \n\t" // Load address of A +" ldr x1,%[baddr] \n\t" // Load address of B +" ldr x2,%[caddr] \n\t" // Load address of C +" \n\t" +" ldr x3,%[a_next] \n\t" // Load address of next block of A +" ldr x4,%[b_next] \n\t" // Load address of next block of B +" \n\t" +" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) +" ldr x6,%[k_left] \n\t" // Init guard (k_left) +" \n\t" +" ldr x7,%[alpha] \n\t" // Alpha address +" ldr x8,%[beta] \n\t" // Beta address +" \n\t" +" ldr x9,%[cs_c] \n\t" // Load cs_c +" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) +" \n\t" +" ldr x13,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). +" \n\t" +" add x20,x2,x10 \n\t" // Address of column 1 of C +" add x21,x20,x10 \n\t" // Address of column 2 of C +" add x22,x21,x10 \n\t" // Address of column 3 of C +" add x23,x22,x10 \n\t" // Address of column 4 of C +" add x24,x23,x10 \n\t" // Address of column 5 of C +" add x25,x24,x10 \n\t" // Address of column 6 of C +" add x26,x25,x10 \n\t" // Address of column 7 of C +" \n\t" +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. +" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" \n\t" +" ptrue p0.d, all \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // The following prefetches on [x1] and [x0] +" \n\t" // cover b rows 4..7 and a columns 4..7, +" \n\t" // both of which are used in the next iteration +" \n\t" // of k_iter (the loop is unrolled by 4). +" \n\t" +" dup z16.d, #0 \n\t" // Vector for accumulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 +" dup z17.d, #0 \n\t" // Vector for accumulating column 0 +" prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 +" dup z18.d, #0 \n\t" // Vector for accumulating column 1 +" prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 +" dup z19.d, #0 \n\t" // Vector for accumulating column 1 +" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.7 +" dup z20.d, #0 \n\t" // Vector for accumulating column 2 +" dup z21.d, #0 \n\t" // Vector for accumulating column 2 +" \n\t" +" dup z22.d, #0 \n\t" // Vector for accumulating column 3 +" prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 +" dup z23.d, #0 \n\t" // Vector for accumulating column 3 +" prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 +" dup z24.d, #0 \n\t" // Vector for accumulating column 4 +" prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col.
no.6 +" dup z25.d, #0 \n\t" // Vector for accummulating column 4 +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.7 +" dup z26.d, #0 \n\t" // Vector for accummulating column 5 +" dup z27.d, #0 \n\t" // Vector for accummulating column 5 +" \n\t" +" dup z28.d, #0 \n\t" // Vector for accummulating column 6 +" dup z29.d, #0 \n\t" // Vector for accummulating column 6 +" dup z30.d, #0 \n\t" // Vector for accummulating column 7 +" dup z31.d, #0 \n\t" // Vector for accummulating column 7 +" \n\t" +" \n\t" +" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. +" beq .DCONSIDERKLEFT \n\t" +" \n\t" +" add x0, x0, #64 \n\t" //update address of A +" add x1, x1, #64 \n\t" //update address of B +" \n\t" +" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. +" beq .DLASTITER \n\t" // (as loop is do-while-like). +" \n\t" +" DLOOP: \n\t" // Body +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. 
no.9 +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" \n\t" +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate 
c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #256 \n\t" +" add x1, x1, #128 \n\t" +" \n\t" +" sub x5,x5,1 \n\t" // i-=1 +" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. +" bne DLOOP \n\t" +" \n\t" +".DLASTITER: \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" 
\n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // 
Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" add x1, x1, #64 \n\t" +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #192 \n\t" +" \n\t" +" .DCONSIDERKLEFT: \n\t" +" cmp x6,0 \n\t" // If k_left == 0, we are done. +" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +" \n\t" +".DLOOPKLEFT: \n\t" +" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" add x0, x0, #64 \n\t" +" \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" add x1, x1, #64 \n\t" +" \n\t" +" sub x6,x6,1 \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" cmp x6,0 \n\t" // Iterate again. +" bne .DLOOPKLEFT \n\t" // if i!=0. +" \n\t" +" .DPOSTACCUM: \n\t" +" \n\t" +" ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. +" ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta +" \n\t" +" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" bne .DGENSTORED \n\t" +" \n\t" +" .DCOLSTORED: \n\t" // C is column-major. +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z0, [x2] \n\t" //Load column 0 of C +" ldr z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x20] \n\t" //Load column 1 of C +" ldr z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x2] \n\t" //Store column 0 of C +" str z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x20] \n\t" //Store column 1 of C +" str z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr z8, [x21] \n\t" //Load column 2 of C +" ldr z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x22] \n\t" //Load column 3 of C +" ldr z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x21] \n\t" //Store column 2 of C +" str z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x22] \n\t" //Store column 3 of C +" str z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr z0, [x23] \n\t" //Load column 4 of C +" ldr z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x24] \n\t" //Load column 5 of C +" ldr z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x23] \n\t" //Store column 4 of C +" str z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x24] \n\t" //Store column 5 of C +" str z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z8, [x25] \n\t" //Load column 6 of C +" ldr z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x26] \n\t" //Load column 7 of C +" ldr z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x25] \n\t" //Store column 6 of C +" str z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x26] \n\t" //Store column 7 of C +" str z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" b .DEND \n\t" +" \n\t" +" .DGENSTORED: \n\t" // C is general-stride stored. +" \n\t" +" \n\t" // x14 is row-stride in number of bytes. +" lsl x15,x14,#2 \n\t" // x15 is 4-row-stride, which is the address offset +" \n\t" // btw c(4,*) and c(0,*) +" index z4.d, xzr, x14 \n\t" // z4 is address offsets of four contiguous elements +" \n\t" // in a column. such as c( 0:3,* ). +" \n\t" // z4 is used as vector index for gather/scatter +" \n\t" // loading/storing from column of *c +" \n\t" +" \n\t" // C's each column's address: +" \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) +" \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) +" add x5, x15, x2 \n\t" // x5 is address of c(4,0) +" add x6, x15, x20 \n\t" // x6 is address of c(4,1) +" add x7, x15, x21 \n\t" // x7 is address of c(4,2) +" add x8, x15, x22 \n\t" // x8 is address of c(4,3) +" add x16, x15, x23 \n\t" // x16 is address of c(4,4) +" add x17, x15, x24 \n\t" // x17 is address of c(4,5) +" add x18, x15, x25 \n\t" // x18 is address of c(4,6) +" add x19, x15, x26 \n\t" // x19 is address of c(4,7) +" \n\t" +" dup z0.d, #0 \n\t" // C column 0, 1 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x2 is address of c(0,0) +" \n\t" // x5 is address of c(4,0) +" \n\t" // x20 is address of c(0,1) +" \n\t" // x6 is address of c(4,1) +" ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 +" ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 +" ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 +" ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 +" st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 +" st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 +" st1d {z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 +" \n\t" +" \n\t" +" \n\t" +" dup z8.d, #0 \n\t" // C column 2, 3 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" \n\t" // x21 is address of c(0,2) +" \n\t" // x7 is address of c(4,2) +" \n\t" // x22 is address of c(0,3) +" \n\t" // x8 is address of c(4,3) +" ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 +" ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 +" ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 +" ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 +" st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 +" st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 +" st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 +" \n\t" +" dup z0.d, #0 \n\t" // C column 4, 5 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x23 is address of c(0,4) +" \n\t" // x16 is address of c(4,4) +" \n\t" // x24 is address of c(0,5) +" \n\t" // x17 is address of c(4,5) +" ld1d {z0.d}, p0/z, [x23, z4.d] \n\t" // Load c( 0:3,4 ) into z0 +" ld1d {z1.d}, p0/z, [x16, z4.d] \n\t" // Load c( 4:7,4 ) into z1 +" ld1d {z2.d}, p0/z, [x24, z4.d] \n\t" // Load c( 0:3,5 ) into z2 +" ld1d {z3.d}, p0/z, [x17, z4.d] \n\t" // Load c( 4:7,5 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x23, z4.d] \n\t" // Store c( 0:3,4 ) <- z0 +" st1d {z1.d}, p0, [x16, z4.d] \n\t" // Store c( 4:7,4 ) <- z1 +" st1d {z2.d}, p0, [x24, z4.d] \n\t" // Store c( 0:3,5 ) <- z2 +" st1d {z3.d}, p0, [x17, z4.d] \n\t" // Store c( 4:7,5 ) <- z3 +" \n\t" +" dup z8.d, #0 \n\t" // C column 6, 7 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" \n\t" // x25 is address of c(0,6) +" \n\t" // x18 is address of c(4,6) +" \n\t" // x26 is address of c(0,7) +" \n\t" // x19 is address of c(4,7) +" ld1d {z8.d}, p0/z, [x25, z4.d] \n\t" // Load c( 0:3,6 ) into z8 +" ld1d {z9.d}, p0/z, [x18, z4.d] \n\t" // Load c( 4:7,6 ) into z9 +" ld1d {z10.d}, p0/z, [x26, z4.d] \n\t" // Load c( 0:3,7 ) into z10 +" ld1d {z11.d}, p0/z, [x19, z4.d] \n\t" // Load c( 4:7,7 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x25, z4.d] \n\t" // Store c( 0:3,6 ) <- z8 +" st1d {z9.d}, p0, [x18, z4.d] \n\t" // Store c( 4:7,6 ) <- z9 +" st1d {z10.d}, p0, [x26, z4.d] \n\t" // Store c( 0:3,7 ) <- z10 +" st1d {z11.d}, p0, [x19, z4.d] \n\t" // Store c( 4:7,7 ) <- z11 +" \n\t" +" .DEND: \n\t" // Done! 
+" \n\t" +:// output operands (none) +:// input operands + [aaddr] "m" (a), // 0 + [baddr] "m" (b), // 1 + [caddr] "m" (c), // 2 + [k_iter] "m" (k_iter), // 3 + [k_left] "m" (k_left), // 4 + [alpha] "m" (alpha), // 5 + [beta] "m" (beta), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [a_next] "m" (a_next), // 9 + [b_next] "m" (b_next) // 10 +:// Register clobber list + "x0","x1","x2","x3", + "x4","x5","x6", + "x7","x8","x9", + "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", + "x20","x21","x22","x23","x24","x25","x26", + "x27", + "v0","v1","v2", + "v3","v4","v5", + "v6","v7","v8", + "v9","v10","v11", + "v12","v13","v14", + "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" +); + +} diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h new file mode 100644 index 0000000000..a5934312a0 --- /dev/null +++ b/kernels/armsve/bli_kernels_armsve.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) + +PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
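Note: the gather-load workaround used in bli_dpackm_armsve256_asm_8xk above can be read in isolation with the short sketch below. It is illustrative only and not part of the patch: it assumes a 256-bit SVE vector length (four doubles per vector) and uses a hypothetical helper name, packing a single 8-element column of A, whose elements are spaced inca doubles apart, into eight contiguous doubles at p using the same intrinsics as the kernel.

#include <arm_sve.h>
#include <stdint.h>

// Illustrative sketch only: pack one 8-element column of A (row stride inca,
// in units of doubles) into a contiguous 8-element destination p, assuming
// a 256-bit SVE vector length (4 doubles per vector).
static void pack_one_column_8( const double* a, int64_t inca, double* p )
{
    const svbool_t all_active = svptrue_b64();

    // Element i of z_index holds the byte offset i * inca * sizeof( double ).
    svuint64_t z_index = svindex_u64( 0, inca * sizeof( double ) );

    // Gather a( 0:3 ) and a( 4:7 ) from strided memory ...
    svfloat64_t z_a0 = svld1_gather_u64offset_f64( all_active, a,            z_index );
    svfloat64_t z_a4 = svld1_gather_u64offset_f64( all_active, a + 4 * inca, z_index );

    // ... and store them contiguously into the packed buffer.
    svst1_f64     ( all_active, p,    z_a0 );
    svst1_vnum_f64( all_active, p, 1, z_a4 );
}

In the kernel itself this body is repeated for each of the n columns, advancing a by lda and p by ldp per iteration, with an optional svmul_lane_f64 by kappa before the stores.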