diff --git a/frame/include/bli_arch_config.h b/frame/include/bli_arch_config.h index 3aca1cba5c..21b044281a 100644 --- a/frame/include/bli_arch_config.h +++ b/frame/include/bli_arch_config.h @@ -265,6 +265,9 @@ CNTX_INIT_PROTS( generic ) // -- ARM architectures -- +#ifdef BLIS_KERNELS_ARMSVE +#include "bli_kernels_armsve.h" +#endif #ifdef BLIS_KERNELS_ARMV8A #include "bli_kernels_armv8a.h" #endif diff --git a/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c new file mode 100644 index 0000000000..82def6df7b --- /dev/null +++ b/kernels/armsve/1m/bli_dpackm_armsve256_asm_8xk.c @@ -0,0 +1,235 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "blis.h" + +#ifdef __ARM_FEATURE_SVE +#include <arm_sve.h> +#else +#error "No Arm SVE intrinsics support in compiler" +#endif // __ARM_FEATURE_SVE + +// assumption: +// SVE vector length = 256 bits. +// + +void bli_dpackm_armsve256_asm_8xk + ( + conj_t conja, + pack_t schema, + dim_t cdim_, + dim_t n_, + dim_t n_max_, + void* restrict kappa_, + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_, + cntx_t* restrict cntx + ) +{ + double* a = ( double* )a_; + double* p = ( double* )p_; + double* kappa = ( double* )kappa_; + const int64_t cdim = cdim_; + const int64_t mnr = 8; + const int64_t n = n_; + const int64_t n_max = n_max_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; + + double* restrict alpha1 = a; + double* restrict alpha1_4 = alpha1 + 4 * inca; + double* restrict pi1 = p; + const svbool_t all_active = svptrue_b64(); + svfloat64_t z_a0; + svfloat64_t z_a4; + svuint64_t z_index; + + // Create the index vector for gather/scatter accesses: element i holds + // the byte offset i * inca * sizeof( double ). + z_index = svindex_u64( 0, inca * sizeof( double ) ); + + if ( cdim == mnr ) + { + if ( bli_deq1( *kappa ) ) + { + if ( inca == 1 ) // contiguous memory.
packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // svld1_f64 returns all zeros in z_a0 and z_a4 here, which is + // incorrect; this appears to be a problem with how qemu-aarch64 + // or gcc handles svld1_f64. + + // load 8 contiguous elements from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + + // as a workaround, use a gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store. packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } + else // *kappa != 1.0 + { + // load kappa into vector + svfloat64_t z_kappa; + + z_kappa = svdup_f64( *kappa ); + + if ( inca == 1 ) // contiguous memory. packA style + { + for ( dim_t k = n; k != 0; --k ) + { + // load 8 contiguous elements from *a + // z_a0 = svld1_f64( all_active, alpha1 ); + // z_a4 = svld1_vnum_f64( all_active, alpha1, 1 ); + // same reason as above; as a workaround, use a gather load + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // store them into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + else // gather/scatter load/store.
packB style + { + for ( dim_t k = n; k != 0; --k ) + { + // gather load from *a + z_a0 = svld1_gather_u64offset_f64( all_active, alpha1, z_index ); + z_a4 = svld1_gather_u64offset_f64( all_active, alpha1_4, z_index ); + + // multiply by *kappa + z_a0 = svmul_lane_f64( z_a0, z_kappa, 0 ); + z_a4 = svmul_lane_f64( z_a4, z_kappa, 0 ); + + // scatter store into *p + svst1_f64( all_active, pi1, z_a0 ); + svst1_vnum_f64( all_active, pi1, 1, z_a4 ); + + alpha1 += lda; + alpha1_4 = alpha1 + 4 * inca; + pi1 += ldp; + } + } + } // end of if ( *kappa == 1.0 ) + } + else // if ( cdim < mnr ) + { + bli_dscal2m_ex + ( + 0, + BLIS_NONUNIT_DIAG, + BLIS_DENSE, + ( trans_t )conja, + cdim, + n, + kappa, + a, inca, lda, + p, 1, ldp, + cntx, + NULL + ); + + // if ( cdim < mnr ) + { + const dim_t i = cdim; + const dim_t m_edge = mnr - i; + const dim_t n_edge = n_max; + double* restrict p_edge = p + (i )*1; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } + } + + if ( n < n_max ) + { + const dim_t j = n; + const dim_t m_edge = mnr; + const dim_t n_edge = n_max - j; + double* restrict p_edge = p + (j )*ldp; + + bli_dset0s_mxn + ( + m_edge, + n_edge, + p_edge, 1, ldp + ); + } +} diff --git a/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c new file mode 100644 index 0000000000..01bb644b12 --- /dev/null +++ b/kernels/armsve/3/bli_gemm_armsve256_asm_d8x8.c @@ -0,0 +1,809 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + Copyright (C) 2020, Linaro Limited + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +*/ + +#include "blis.h" + +/* + o 8x8 Double precision micro-kernel + o Runnable on ARMv8a with SVE 256 feature, compiled with aarch64 GCC. + o Tested on qemu-aarch64 and armie for SVE. + + Preconditions: + - to use this kernel, SVE with vector length of 256 bits is a must. + + April 2020. 
+*/ +void bli_dgemm_armsve256_asm_8x8 + ( + dim_t k0, + double* restrict alpha, + double* restrict a, + double* restrict b, + double* restrict beta, + double* restrict c, inc_t rs_c0, inc_t cs_c0, + auxinfo_t* restrict data, + cntx_t* restrict cntx + ) +{ + void* a_next = bli_auxinfo_next_a( data ); + void* b_next = bli_auxinfo_next_b( data ); + + // Typecast local copies of integers in case dim_t and inc_t are a + // different size than is expected by load instructions. + uint64_t k_iter = k0 / 4; + uint64_t k_left = k0 % 4; + uint64_t rs_c = rs_c0; + uint64_t cs_c = cs_c0; + +__asm__ volatile +( +" \n\t" +" ldr x0,%[aaddr] \n\t" // Load address of A +" ldr x1,%[baddr] \n\t" // Load address of B +" ldr x2,%[caddr] \n\t" // Load address of C +" \n\t" +" ldr x3,%[a_next] \n\t" // Load address of next block of A +" ldr x4,%[b_next] \n\t" // Load address of next block of B +" \n\t" +" ldr x5,%[k_iter] \n\t" // Init guard (k_iter) +" ldr x6,%[k_left] \n\t" // Init guard (k_left) +" \n\t" +" ldr x7,%[alpha] \n\t" // Alpha address +" ldr x8,%[beta] \n\t" // Beta address +" \n\t" +" ldr x9,%[cs_c] \n\t" // Load cs_c +" lsl x10,x9,#3 \n\t" // cs_c * sizeof(double) +" \n\t" +" ldr x13,%[rs_c] \n\t" // Load rs_c. +" lsl x14,x13,#3 \n\t" // rs_c * sizeof(double). +" \n\t" +" add x20,x2,x10 \n\t" // Address of column 1 of C +" add x21,x20,x10 \n\t" // Address of column 2 of C +" add x22,x21,x10 \n\t" // Address of column 3 of C +" add x23,x22,x10 \n\t" // Address of column 4 of C +" add x24,x23,x10 \n\t" // Address of column 5 of C +" add x25,x24,x10 \n\t" // Address of column 6 of C +" add x26,x25,x10 \n\t" // Address of column 7 of C +" \n\t" +" prfm pldl1keep,[x2] \n\t" // Prefetch c. +" prfm pldl1keep,[x20] \n\t" // Prefetch c. +" prfm pldl1keep,[x21] \n\t" // Prefetch c. +" prfm pldl1keep,[x22] \n\t" // Prefetch c. +" prfm pldl1keep,[x23] \n\t" // Prefetch c. +" prfm pldl1keep,[x24] \n\t" // Prefetch c. +" prfm pldl1keep,[x25] \n\t" // Prefetch c. +" prfm pldl1keep,[x26] \n\t" // Prefetch c. +" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" \n\t" +" ptrue p0.d, all \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // The following prefetches on [x1] and [x0] +" \n\t" // cover b rows 4..7 and a columns 4..7, +" \n\t" // both of which are used in the next iteration +" \n\t" // of k_iter (the loop is unrolled by 4). +" \n\t" +" dup z16.d, #0 \n\t" // Vector for accumulating column 0 +" prfm PLDL1KEEP, [x1, #256] \n\t" // prefetch b row no.4 +" dup z17.d, #0 \n\t" // Vector for accumulating column 0 +" prfm PLDL1KEEP, [x1, #320] \n\t" // prefetch b row no.5 +" dup z18.d, #0 \n\t" // Vector for accumulating column 1 +" prfm PLDL1KEEP, [x1, #384] \n\t" // prefetch b row no.6 +" dup z19.d, #0 \n\t" // Vector for accumulating column 1 +" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.7 +" dup z20.d, #0 \n\t" // Vector for accumulating column 2 +" dup z21.d, #0 \n\t" // Vector for accumulating column 2 +" \n\t" +" dup z22.d, #0 \n\t" // Vector for accumulating column 3 +" prfm PLDL1KEEP, [x0, #256] \n\t" // prefetch a col. no.4 +" dup z23.d, #0 \n\t" // Vector for accumulating column 3 +" prfm PLDL1KEEP, [x0, #320] \n\t" // prefetch a col. no.5 +" dup z24.d, #0 \n\t" // Vector for accumulating column 4 +" prfm PLDL1KEEP, [x0, #384] \n\t" // prefetch a col.
no.6 +" dup z25.d, #0 \n\t" // Vector for accummulating column 4 +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.7 +" dup z26.d, #0 \n\t" // Vector for accummulating column 5 +" dup z27.d, #0 \n\t" // Vector for accummulating column 5 +" \n\t" +" dup z28.d, #0 \n\t" // Vector for accummulating column 6 +" dup z29.d, #0 \n\t" // Vector for accummulating column 6 +" dup z30.d, #0 \n\t" // Vector for accummulating column 7 +" dup z31.d, #0 \n\t" // Vector for accummulating column 7 +" \n\t" +" \n\t" +" cmp x5,#0 \n\t" // If k_iter == 0, jump to k_left. +" beq .DCONSIDERKLEFT \n\t" +" \n\t" +" add x0, x0, #64 \n\t" //update address of A +" add x1, x1, #64 \n\t" //update address of B +" \n\t" +" cmp x5,1 \n\t" // If there is just one k_iter, jump to that one. +" beq .DLASTITER \n\t" // (as loop is do-while-like). +" \n\t" +" DLOOP: \n\t" // Body +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #448] \n\t" // prefetch b row no.8, 512-64=448 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #512] \n\t" // prefetch b row no.9 +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x1, #576] \n\t" // prefetch b row no.10 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x1, #640] \n\t" // prefetch b row no.11 +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #448] \n\t" // prefetch a col. no.8 +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" prfm PLDL1KEEP, [x0, #512] \n\t" // prefetch a col. 
no.9 +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #576] \n\t" // prefetch a col. no.10 +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" prfm PLDL1KEEP, [x0, #640] \n\t" // prefetch a col. no.11 +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" \n\t" +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate 
c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #6, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #7, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #256 \n\t" +" add x1, x1, #128 \n\t" +" \n\t" +" sub x5,x5,1 \n\t" // i-=1 +" cmp x5,1 \n\t" // Iterate again if we are not in k_iter == 1. +" bne DLOOP \n\t" +" \n\t" +".DLASTITER: \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, [x0, #1, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 1 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z0, [x0, #2, MUL VL] \n\t" // Load a( 0:3,l ) +" 
\n\t" +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z1, [x0, #3, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #64] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #80] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #96] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #112] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" +" \n\t" //End it 2 +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" ldr z6, [x0, #4, MUL VL] \n\t" // Load a( 0:3,l ) +" \n\t" +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" ldr z7, [x0, #5, MUL VL] \n\t" // load a( 4:7,l ) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" add x1, x1, #128 \n\t" // because immediate in 'ldr1rqd' must be +" \n\t" // in range -128 to 112 +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" ld1rqd {z2.d}, p0/z, [x1, #0] \n\t" // load b( l,0:1 ) +" \n\t" +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" \n\t" +" \n\t" // End it 3 +" \n\t" +" fmla z16.d, z6.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z7.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z6.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z7.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z6.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z7.d, z3.d[0] \n\t" // 
Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z6.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z7.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z6.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z7.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z6.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z7.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" add x1, x1, #64 \n\t" +" \n\t" +" fmla z28.d, z6.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z7.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z6.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z7.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" \n\t" //End it 4 +" add x0, x0, #192 \n\t" +" \n\t" +" .DCONSIDERKLEFT: \n\t" +" cmp x6,0 \n\t" // If k_left == 0, we are done. +" beq .DPOSTACCUM \n\t" // else, we enter the k_left loop. +" \n\t" +".DLOOPKLEFT: \n\t" +" \n\t" +" ldr z0, [x0] \n\t" // Load a +" ldr z1, [x0, #1, MUL VL] \n\t" +" add x0, x0, #64 \n\t" +" \n\t" +" ld1rqd {z2.d}, p0/z, [x1] \n\t" // load b( l,0:1 ) +" ld1rqd {z3.d}, p0/z, [x1, #16] \n\t" // load b( l,2:3 ) +" ld1rqd {z4.d}, p0/z, [x1, #32] \n\t" // load b( l,4:5 ) +" ld1rqd {z5.d}, p0/z, [x1, #48] \n\t" // load b( l,6:7 ) +" add x1, x1, #64 \n\t" +" \n\t" +" sub x6,x6,1 \n\t" +" \n\t" +" fmla z16.d, z0.d, z2.d[0] \n\t" // Accummulate c(0:3,0)+=a(0:3,l)*b(l,0) +" fmla z17.d, z1.d, z2.d[0] \n\t" // Accummulate c(4:7,0)+=a(4:7,l)*b(l,0) +" \n\t" +" fmla z18.d, z0.d, z2.d[1] \n\t" // Accummulate c(0:3,1)+=a(0:3,l)*b(l,1) +" fmla z19.d, z1.d, z2.d[1] \n\t" // Accummulate c(4:7,1)+=a(4:7,l)*b(l,1) +" \n\t" +" fmla z20.d, z0.d, z3.d[0] \n\t" // Accummulate c(0:3,2)+=a(0:3,l)*b(l,2) +" fmla z21.d, z1.d, z3.d[0] \n\t" // Accummulate c(4:7,2)+=a(4:7,l)*b(l,2) +" \n\t" +" fmla z22.d, z0.d, z3.d[1] \n\t" // Accummulate c(0:3,3)+=a(0:3,l)*b(l,3) +" fmla z23.d, z1.d, z3.d[1] \n\t" // Accummulate c(4:7,3)+=a(4:7,l)*b(l,3) +" \n\t" +" fmla z24.d, z0.d, z4.d[0] \n\t" // Accummulate c(0:3,4)+=a(0:3,l)*b(l,4) +" fmla z25.d, z1.d, z4.d[0] \n\t" // Accummulate c(4:7,4)+=a(4:7,l)*b(l,4) +" \n\t" +" fmla z26.d, z0.d, z4.d[1] \n\t" // Accummulate c(0:3,5)+=a(0:3,l)*b(l,5) +" fmla z27.d, z1.d, z4.d[1] \n\t" // Accummulate c(4:7,5)+=a(0:3,l)*b(l,5) +" \n\t" +" fmla z28.d, z0.d, z5.d[0] \n\t" // Accummulate c(0:3,6)+=a(0:3,l)*b(l,6) +" fmla z29.d, z1.d, z5.d[0] \n\t" // Accummulate c(4:7,6)+=a(0:3,l)*b(l,6) +" \n\t" +" fmla z30.d, z0.d, z5.d[1] \n\t" // Accummulate c(0:3,7)+=a(0:3,l)*b(l,7) +" fmla z31.d, z1.d, z5.d[1] \n\t" // Accummulate c(4:7,7)+=a(0:3,l)*b(l,7) +" \n\t" +" cmp x6,0 \n\t" // Iterate again. +" bne .DLOOPKLEFT \n\t" // if i!=0. +" \n\t" +" .DPOSTACCUM: \n\t" +" \n\t" +" ld1rd {z6.d}, p0/z, [x7] \n\t" // Load alpha. +" ld1rd {z7.d}, p0/z, [x8] \n\t" // Load beta +" \n\t" +" cmp x13,#1 \n\t" // If rs_c != 1 (column-major) +" bne .DGENSTORED \n\t" +" \n\t" +" .DCOLSTORED: \n\t" // C is column-major. +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z0, [x2] \n\t" //Load column 0 of C +" ldr z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x20] \n\t" //Load column 1 of C +" ldr z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x2] \n\t" //Store column 0 of C +" str z1, [x2, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x20] \n\t" //Store column 1 of C +" str z3, [x20, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr z8, [x21] \n\t" //Load column 2 of C +" ldr z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x22] \n\t" //Load column 3 of C +" ldr z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x21] \n\t" //Store column 2 of C +" str z9, [x21, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x22] \n\t" //Store column 3 of C +" str z11, [x22, #1, MUL VL] \n\t" +" \n\t" +" dup z0.d, #0 \n\t" +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS3 \n\t" // Taking care of the beta==0 case. +" \n\t" +" ldr z0, [x23] \n\t" //Load column 4 of C +" ldr z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" ldr z2, [x24] \n\t" //Load column 5 of C +" ldr z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z0, [x23] \n\t" //Store column 4 of C +" str z1, [x23, #1, MUL VL] \n\t" +" \n\t" +" str z2, [x24] \n\t" //Store column 5 of C +" str z3, [x24, #1, MUL VL] \n\t" +" \n\t" +" dup z8.d, #0 \n\t" +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROCOLSTOREDS4 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" ldr z8, [x25] \n\t" //Load column 6 of C +" ldr z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" ldr z10, [x26] \n\t" //Load column 7 of C +" ldr z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROCOLSTOREDS4: \n\t" +" \n\t" +" prfm pldl2keep,[x3] \n\t" +" prfm pldl2keep,[x4] \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" str z8, [x25] \n\t" //Store column 6 of C +" str z9, [x25, #1, MUL VL] \n\t" +" \n\t" +" str z10, [x26] \n\t" //Store column 7 of C +" str z11, [x26, #1, MUL VL] \n\t" +" \n\t" +" b .DEND \n\t" +" \n\t" +" .DGENSTORED: \n\t" // C is general-stride stored. +" \n\t" +" \n\t" // x14 is row-stride in number of bytes. +" lsl x15,x14,#2 \n\t" // x15 is 4-row-stride, which is the address offset +" \n\t" // btw c(4,*) and c(0,*) +" index z4.d, xzr, x14 \n\t" // z4 is address offsets of four contiguous elements +" \n\t" // in a column. such as c( 0:3,* ). +" \n\t" // z4 is used as vector index for gather/scatter +" \n\t" // loading/storing from column of *c +" \n\t" +" \n\t" // C's each column's address: +" \n\t" // x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7) +" \n\t" // x5, x6, x7, x8, x16, x17, x18, x19: are addresses of c(4,0:7) +" add x5, x15, x2 \n\t" // x5 is address of c(4,0) +" add x6, x15, x20 \n\t" // x6 is address of c(4,1) +" add x7, x15, x21 \n\t" // x7 is address of c(4,2) +" add x8, x15, x22 \n\t" // x8 is address of c(4,3) +" add x16, x15, x23 \n\t" // x16 is address of c(4,4) +" add x17, x15, x24 \n\t" // x17 is address of c(4,5) +" add x18, x15, x25 \n\t" // x18 is address of c(4,6) +" add x19, x15, x26 \n\t" // x19 is address of c(4,7) +" \n\t" +" dup z0.d, #0 \n\t" // C column 0, 1 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS1 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x2 is address of c(0,0) +" \n\t" // x5 is address of c(4,0) +" \n\t" // x20 is address of c(0,1) +" \n\t" // x6 is address of c(4,1) +" ld1d {z0.d}, p0/z, [x2, z4.d] \n\t" // Load c( 0:3,0 ) into z0 +" ld1d {z1.d}, p0/z, [x5, z4.d] \n\t" // Load c( 4:7,0 ) into z1 +" ld1d {z2.d}, p0/z, [x20, z4.d] \n\t" // Load c( 0:3,1 ) into z2 +" ld1d {z3.d}, p0/z, [x6 , z4.d] \n\t" // Load c( 4:7,1 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS1: \n\t" +" \n\t" +" fmla z0.d, z16.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z17.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z18.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z19.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x2 , z4.d] \n\t" // Store c( 0:3,0 ) <- z0 +" st1d {z1.d}, p0, [x5 , z4.d] \n\t" // Store c( 4:7,0 ) <- z1 +" st1d {z2.d}, p0, [x20, z4.d] \n\t" // Store c( 0:3,1 ) <- z2 +" st1d {z3.d}, p0, [x6 , z4.d] \n\t" // Store c( 4:7,1 ) <- z3 +" \n\t" +" \n\t" +" \n\t" +" dup z8.d, #0 \n\t" // C column 2, 3 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS2 \n\t" // Taking care of the beta==0 case. +" \n\t" +" \n\t" // x21 is address of c(0,2) +" \n\t" // x7 is address of c(4,2) +" \n\t" // x22 is address of c(0,3) +" \n\t" // x8 is address of c(4,3) +" ld1d {z8.d}, p0/z, [x21, z4.d] \n\t" // Load c( 0:3,2 ) into z8 +" ld1d {z9.d}, p0/z, [x7 , z4.d] \n\t" // Load c( 4:7,2 ) into z9 +" ld1d {z10.d}, p0/z, [x22, z4.d] \n\t" // Load c( 0:3,3 ) into z10 +" ld1d {z11.d}, p0/z, [x8 , z4.d] \n\t" // Load c( 4:7,3 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS2: \n\t" +" \n\t" +" fmla z8.d, z20.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z21.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z22.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z23.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x21, z4.d] \n\t" // Store c( 0:3,2 ) <- z8 +" st1d {z9.d}, p0, [x7 , z4.d] \n\t" // Store c( 4:7,2 ) <- z9 +" st1d {z10.d}, p0, [x22, z4.d] \n\t" // Store c( 0:3,3 ) <- z10 +" st1d {z11.d}, p0, [x8 , z4.d] \n\t" // Store c( 4:7,3 ) <- z11 +" \n\t" +" dup z0.d, #0 \n\t" // C column 4, 5 +" dup z1.d, #0 \n\t" +" dup z2.d, #0 \n\t" +" dup z3.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS3 \n\t" // Taking care of the beta==0 case. 
+" \n\t" +" \n\t" // x23 is address of c(0,4) +" \n\t" // x16 is address of c(4,4) +" \n\t" // x24 is address of c(0,5) +" \n\t" // x17 is address of c(4,5) +" ld1d {z0.d}, p0/z, [x23, z4.d] \n\t" // Load c( 0:3,4 ) into z0 +" ld1d {z1.d}, p0/z, [x16, z4.d] \n\t" // Load c( 4:7,4 ) into z1 +" ld1d {z2.d}, p0/z, [x24, z4.d] \n\t" // Load c( 0:3,5 ) into z2 +" ld1d {z3.d}, p0/z, [x17, z4.d] \n\t" // Load c( 4:7,5 ) into z3 +" \n\t" +" fmul z0.d, z0.d, z7.d \n\t" // Scale by beta +" fmul z1.d, z1.d, z7.d \n\t" // Scale by beta +" fmul z2.d, z2.d, z7.d \n\t" // Scale by beta +" fmul z3.d, z3.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS3: \n\t" +" \n\t" +" fmla z0.d, z24.d, z6.d[0] \n\t" // Scale by alpha +" fmla z1.d, z25.d, z6.d[0] \n\t" // Scale by alpha +" fmla z2.d, z26.d, z6.d[0] \n\t" // Scale by alpha +" fmla z3.d, z27.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z0.d}, p0, [x23, z4.d] \n\t" // Store c( 0:3,4 ) <- z0 +" st1d {z1.d}, p0, [x16, z4.d] \n\t" // Store c( 4:7,4 ) <- z1 +" st1d {z2.d}, p0, [x24, z4.d] \n\t" // Store c( 0:3,5 ) <- z2 +" st1d {z3.d}, p0, [x17, z4.d] \n\t" // Store c( 4:7,5 ) <- z3 +" \n\t" +" dup z8.d, #0 \n\t" // C column 6, 7 +" dup z9.d, #0 \n\t" +" dup z10.d, #0 \n\t" +" dup z11.d, #0 \n\t" +" \n\t" +" fcmp d7,#0.0 \n\t" +" beq .DBETAZEROGENSTOREDS4 \n\t" // Taking care of the beta==0 case. +" \n\t" +" \n\t" // x25 is address of c(0,6) +" \n\t" // x18 is address of c(4,6) +" \n\t" // x26 is address of c(0,7) +" \n\t" // x19 is address of c(4,7) +" ld1d {z8.d}, p0/z, [x25, z4.d] \n\t" // Load c( 0:3,6 ) into z8 +" ld1d {z9.d}, p0/z, [x18, z4.d] \n\t" // Load c( 4:7,6 ) into z9 +" ld1d {z10.d}, p0/z, [x26, z4.d] \n\t" // Load c( 0:3,7 ) into z10 +" ld1d {z11.d}, p0/z, [x19, z4.d] \n\t" // Load c( 4:7,7 ) into z11 +" \n\t" +" fmul z8.d, z8.d, z7.d \n\t" // Scale by beta +" fmul z9.d, z9.d, z7.d \n\t" // Scale by beta +" fmul z10.d, z10.d, z7.d \n\t" // Scale by beta +" fmul z11.d, z11.d, z7.d \n\t" // Scale by beta +" \n\t" +" .DBETAZEROGENSTOREDS4: \n\t" +" \n\t" +" fmla z8.d, z28.d, z6.d[0] \n\t" // Scale by alpha +" fmla z9.d, z29.d, z6.d[0] \n\t" // Scale by alpha +" fmla z10.d, z30.d, z6.d[0] \n\t" // Scale by alpha +" fmla z11.d, z31.d, z6.d[0] \n\t" // Scale by alpha +" \n\t" +" st1d {z8.d}, p0, [x25, z4.d] \n\t" // Store c( 0:3,6 ) <- z8 +" st1d {z9.d}, p0, [x18, z4.d] \n\t" // Store c( 4:7,6 ) <- z9 +" st1d {z10.d}, p0, [x26, z4.d] \n\t" // Store c( 0:3,7 ) <- z10 +" st1d {z11.d}, p0, [x19, z4.d] \n\t" // Store c( 4:7,7 ) <- z11 +" \n\t" +" .DEND: \n\t" // Done! 
+" \n\t" +:// output operands (none) +:// input operands + [aaddr] "m" (a), // 0 + [baddr] "m" (b), // 1 + [caddr] "m" (c), // 2 + [k_iter] "m" (k_iter), // 3 + [k_left] "m" (k_left), // 4 + [alpha] "m" (alpha), // 5 + [beta] "m" (beta), // 6 + [rs_c] "m" (rs_c), // 7 + [cs_c] "m" (cs_c), // 8 + [a_next] "m" (a_next), // 9 + [b_next] "m" (b_next) // 10 +:// Register clobber list + "x0","x1","x2","x3", + "x4","x5","x6", + "x7","x8","x9", + "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19", + "x20","x21","x22","x23","x24","x25","x26", + "x27", + "v0","v1","v2", + "v3","v4","v5", + "v6","v7","v8", + "v9","v10","v11", + "v12","v13","v14", + "v15","v16","v17","v18","v19", + "v20","v21","v22","v23", + "v24","v25","v26","v27", + "v28","v29","v30","v31" +); + +} diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h new file mode 100644 index 0000000000..a5934312a0 --- /dev/null +++ b/kernels/armsve/bli_kernels_armsve.h @@ -0,0 +1,37 @@ +/* + + BLIS + An object-based framework for developing high-performance BLAS-like + libraries. + + Copyright (C) 2014, The University of Texas at Austin + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + - Neither the name(s) of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) + +PACKM_KER_PROT( double, d, packm_armsve256_asm_8xk )
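Note: the gather-load workaround used in bli_dpackm_armsve256_asm_8xk above can be read in isolation with the short sketch below. It is illustrative only and not part of the patch: it assumes a 256-bit SVE vector length (four doubles per vector) and uses a hypothetical helper name, packing a single 8-element column of A, whose elements are spaced inca doubles apart, into eight contiguous doubles at p using the same intrinsics as the kernel.

#include <arm_sve.h>
#include <stdint.h>

// Illustrative sketch only: pack one 8-element column of A (row stride inca,
// in units of doubles) into a contiguous 8-element destination p, assuming
// a 256-bit SVE vector length (4 doubles per vector).
static void pack_one_column_8( const double* a, int64_t inca, double* p )
{
    const svbool_t all_active = svptrue_b64();

    // Element i of z_index holds the byte offset i * inca * sizeof( double ).
    svuint64_t z_index = svindex_u64( 0, inca * sizeof( double ) );

    // Gather a( 0:3 ) and a( 4:7 ) from strided memory ...
    svfloat64_t z_a0 = svld1_gather_u64offset_f64( all_active, a,            z_index );
    svfloat64_t z_a4 = svld1_gather_u64offset_f64( all_active, a + 4 * inca, z_index );

    // ... and store them contiguously into the packed buffer.
    svst1_f64     ( all_active, p,    z_a0 );
    svst1_vnum_f64( all_active, p, 1, z_a4 );
}

In the kernel itself this body is repeated for each of the n columns, advancing a by lda and p by ldp per iteration, with an optional svmul_lane_f64 by kappa before the stores.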