From a2cef0bbf3f0d7df69cacbe1f3ff37b2746bec24 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Thu, 3 Feb 2022 02:31:29 +0900 Subject: [PATCH 1/4] Armv8a, ArmSVE: Simplify Gen-C --- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 74 +- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 68 +- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 62 +- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 74 +- .../armsve/3/{ => old}/armsve_asm_2vx7cmplx.h | 0 .../armsve/3/{ => old}/armsve_asm_2vx8cmplx.h | 0 .../3/{ => old}/armsve_asm_macros_half.h | 0 .../bli_gemm_armsve_asm_z2vx7_unindexed.c | 0 .../bli_gemm_armsve_asm_z2vx8_unindexed.c | 0 kernels/armsve/bli_kernels_armsve.h | 6 +- kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c | 659 +----------------- 11 files changed, 154 insertions(+), 789 deletions(-) rename kernels/armsve/3/{ => old}/armsve_asm_2vx7cmplx.h (100%) rename kernels/armsve/3/{ => old}/armsve_asm_2vx8cmplx.h (100%) rename kernels/armsve/3/{ => old}/armsve_asm_macros_half.h (100%) rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx7_unindexed.c (100%) rename kernels/armsve/3/{ => old}/bli_gemm_armsve_asm_z2vx8_unindexed.c (100%) diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index c84a59f07c..60a64515fd 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -118,8 +118,8 @@ void bli_cgemm_armsve_asm_2vx10_unindexed GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" " CCOL_PRFM: \n\t" -" cmp %3, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp %3, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " WRITE_MEM_EXEC: \n\t" " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. -" cmp %3, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp %3, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" " fmov s29, wzr \n\t" @@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" -" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, -" mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. -" index z28.s, wzr, w3 \n\t" -" fmov s29, wzr \n\t" -" fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. -" fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
-" b.eq ZERO_BETA_G_0_1_2_3 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_G_0_1_2_3: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) -" \n\t" -" b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) -GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_G_4_5_6_7_8_9: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" +// " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +// " mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. +// " index z28.s, wzr, w3 \n\t" +// " fmov s29, wzr \n\t" +// " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. +// " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. +// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +// " ZERO_BETA_G_0_1_2_3: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +// " \n\t" +// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" " \n\t" " END_EXEC: \n\t" " mov %11, #0 \n\t" // Return normal. diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 5a662df4e7..7136104b5b 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -78,7 +78,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. +// " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. 
@@ -117,8 +117,8 @@ void bli_dgemm_armsve_asm_2vx10_unindexed GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " \n\t" " CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp x6, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -253,8 +253,8 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" // Preload first half of C for contiguous case. -" b.ne WRITE_MEM \n\t" +// " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. +// " b.ne WRITE_MEM \n\t" GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) " \n\t" " WRITE_MEM: \n\t" @@ -265,8 +265,8 @@ GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" " UNIT_ALPHA: \n\t" -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp x6, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. @@ -281,32 +281,32 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28, " BETA_ZERO_C: \n\t" GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -" \n\t" -" fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. -" b.eq BETA_ZERO_G \n\t" -" \n\t" -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -" \n\t" -" BETA_ZERO_G: \n\t" -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. +// " mov x8, xzr \n\t" +// " incb x8 \n\t" +// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +// " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. +// " \n\t" +// " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. 
+// " b.eq BETA_ZERO_G \n\t" +// " \n\t" +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// " \n\t" +// " BETA_ZERO_G: \n\t" +// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" +// " END_ERROR: \n\t" +// " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index caa70a5e56..20841891bf 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -78,7 +78,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " mov x3, #10 \n\t" // Row-skip of B. " \n\t" " ldr x5, %[c] \n\t" -" ldr x6, %[rs_c] \n\t" // Row-skip of C. +// " ldr x6, %[rs_c] \n\t" // Row-skip of C. " ldr x7, %[cs_c] \n\t" // Column-skip of C. #ifdef _A64FX " mov x8, 0x3 \n\t" // Tag C address. @@ -117,8 +117,8 @@ void bli_sgemm_armsve_asm_2vx10_unindexed GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " \n\t" " CCOL_PRFM: \n\t" -" cmp x6, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp x6, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -253,8 +253,8 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " UNIT_ALPHA: \n\t" " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. -" cmp x6, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp x6, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. @@ -268,31 +268,31 @@ GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28, " BETA_ZERO_C: \n\t" GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -" \n\t" // Here used scratch: Z[20-30] - Z30 as index. -" mov x8, xzr \n\t" -" incb x8 \n\t" -" madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -" index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. 
-" \n\t" -" fcmp s31, #0.0 \n\t" -" b.eq BETA_ZERO_G \n\t" -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -" \n\t" -" BETA_ZERO_G: \n\t" -GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" -" END_ERROR: \n\t" -" mov x0, #1 \n\t" // Return error. +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. +// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. +// " mov x8, xzr \n\t" +// " incb x8 \n\t" +// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. +// " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. +// " \n\t" +// " fcmp s31, #0.0 \n\t" +// " b.eq BETA_ZERO_G \n\t" +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) +// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) +// " \n\t" +// " BETA_ZERO_G: \n\t" +// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) +// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" +// " END_ERROR: \n\t" +// " mov x0, #1 \n\t" // Return error. " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 25084af35e..7e630894f2 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -118,8 +118,8 @@ void bli_zgemm_armsve_asm_2vx10_unindexed GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" " CCOL_PRFM: \n\t" -" cmp %3, #1 \n\t" -" b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// " cmp %3, #1 \n\t" +// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +// " END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" @@ -233,8 +233,8 @@ MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " WRITE_MEM_EXEC: \n\t" " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. 
-" cmp %3, #1 \n\t" -" b.ne WRITE_MEM_G \n\t" +// " cmp %3, #1 \n\t" +// " b.ne WRITE_MEM_G \n\t" " \n\t" " WRITE_MEM_C: \n\t" " fmov d29, xzr \n\t" @@ -260,38 +260,38 @@ GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -" b END_WRITE_MEM \n\t" -" \n\t" -" WRITE_MEM_G: \n\t" -" add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, -" index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. -" fmov d29, xzr \n\t" -" fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. -" fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. -" b.eq ZERO_BETA_G_0_1_2_3 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_G_0_1_2_3: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) -" \n\t" -" b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" -GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) -GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) -GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) -GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) -GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_G_4_5_6_7_8_9: \n\t" -GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) -GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) -" \n\t" -" END_WRITE_MEM: \n\t" -" b END_EXEC \n\t" -" \n\t" +// " b END_WRITE_MEM \n\t" +// " \n\t" +// " WRITE_MEM_G: \n\t" +// " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, +// " index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. +// " fmov d29, xzr \n\t" +// " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. +// " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. +// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) +// " ZERO_BETA_G_0_1_2_3: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) +// " \n\t" +// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) +// GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) +// GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) +// GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) +// GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) +// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) +// GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) +// " \n\t" +// " END_WRITE_MEM: \n\t" +// " b END_EXEC \n\t" +// " \n\t" " END_EXEC: \n\t" " mov %11, #0 \n\t" // Return normal. 
: "+r" (a), // %0 diff --git a/kernels/armsve/3/armsve_asm_2vx7cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx7cmplx.h similarity index 100% rename from kernels/armsve/3/armsve_asm_2vx7cmplx.h rename to kernels/armsve/3/old/armsve_asm_2vx7cmplx.h diff --git a/kernels/armsve/3/armsve_asm_2vx8cmplx.h b/kernels/armsve/3/old/armsve_asm_2vx8cmplx.h similarity index 100% rename from kernels/armsve/3/armsve_asm_2vx8cmplx.h rename to kernels/armsve/3/old/armsve_asm_2vx8cmplx.h diff --git a/kernels/armsve/3/armsve_asm_macros_half.h b/kernels/armsve/3/old/armsve_asm_macros_half.h similarity index 100% rename from kernels/armsve/3/armsve_asm_macros_half.h rename to kernels/armsve/3/old/armsve_asm_macros_half.h diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c similarity index 100% rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx7_unindexed.c rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx7_unindexed.c diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c b/kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c similarity index 100% rename from kernels/armsve/3/bli_gemm_armsve_asm_z2vx8_unindexed.c rename to kernels/armsve/3/old/bli_gemm_armsve_asm_z2vx8_unindexed.c diff --git a/kernels/armsve/bli_kernels_armsve.h b/kernels/armsve/bli_kernels_armsve.h index 408300308b..39daf30c69 100644 --- a/kernels/armsve/bli_kernels_armsve.h +++ b/kernels/armsve/bli_kernels_armsve.h @@ -33,13 +33,13 @@ */ #include "./3/bli_armsve_utils.h" -GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) +// GEMM_UKR_PROT( double, d, gemm_armsve256_asm_8x8 ) GEMM_UKR_PROT( double, d, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( float, s, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( scomplex, c, gemm_armsve_asm_2vx10_unindexed ) GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx10_unindexed ) -GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) -GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) +// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx8_unindexed ) +// GEMM_UKR_PROT( dcomplex, z, gemm_armsve_asm_2vx7_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_cv_armsve_2vx10_unindexed ) //GEMMSUP_KER_PROT( double, d, gemmsup_rv_armsve_10x2v_unindexed ) diff --git a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c index 7b420f202f..4d9a888178 100644 --- a/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c +++ b/kernels/armv8a/3/bli_gemm_armv8a_asm_d6x8.c @@ -92,8 +92,8 @@ void bli_sgemm_armv8a_asm_8x12 " ldr x10,%[cs_c] \n\t" // Load cs_c. " lsl x10,x10,#2 \n\t" // cs_c * sizeof(float) -- AUX. " \n\t" - " ldr x14,%[rs_c] \n\t" // Load rs_c. - " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). + // " ldr x14,%[rs_c] \n\t" // Load rs_c. + // " lsl x14,x14,#2 \n\t" // rs_c * sizeof(float). " \n\t" " add x16,x2,x10 \n\t" //Load address Column 1 of C " add x17,x16,x10 \n\t" //Load address Column 2 of C @@ -509,9 +509,6 @@ void bli_sgemm_armv8a_asm_8x12 " ldr x0,%[a_next] \n\t" // Pointer to next block of A. " ldr x1,%[b_next] \n\t" // Pointer to next pointer of B. " \n\t" - " cmp x14,#4 \n\t" // If rs_c != 1 (column-major) - BNE(SGENSTORED) - " \n\t" LABEL(SCOLSTORED) // C is column-major. " \n\t" " dup v0.4s, wzr \n\t" @@ -678,384 +675,8 @@ void bli_sgemm_armv8a_asm_8x12 " str q13, [x27, #16] \n\t" " \n\t" " \n\t" - BRANCH(SEND) // Done. - " \n\t" - " \n\t" - LABEL(SGENSTORED) // C is general-stride stored. 
- " \n\t" - " \n\t" - " dup v0.4s, wzr \n\t" - " dup v1.4s, wzr \n\t" - " dup v2.4s, wzr \n\t" - " dup v3.4s, wzr \n\t" - " dup v4.4s, wzr \n\t" - " dup v5.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x2 \n\t" - " \n\t" - " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c00 into quad and increment by rs_c. - " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c01 into quad and increment by rs_c. - " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c02 into quad and increment by rs_c. - " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c03 into quad and increment by rs_c. - " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c04 into quad and increment by rs_c. - " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c05 into quad and increment by rs_c. - " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c06 into quad and increment by rs_c. - " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c07 into quad and increment by rs_c. - " \n\t" - " mov x5, x16 \n\t" - " \n\t" - " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c10 into quad and increment by rs_c. - " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c11 into quad and increment by rs_c. - " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c12 into quad and increment by rs_c. - " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c13 into quad and increment by rs_c. - " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c14 into quad and increment by rs_c. - " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c15 into quad and increment by rs_c. - " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c16 into quad and increment by rs_c. - " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c17 into quad and increment by rs_c. - " \n\t" - " mov x5, x17 \n\t" - " \n\t" - " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c20 into quad and increment by rs_c. - " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c21 into quad and increment by rs_c. - " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c22 into quad and increment by rs_c. - " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c23 into quad and increment by rs_c. - " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c24 into quad and increment by rs_c. - " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c25 into quad and increment by rs_c. - " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c26 into quad and increment by rs_c. - " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c27 into quad and increment by rs_c. - " \n\t" - " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta - " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta - " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta - " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta - " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta - " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS1) - " \n\t" - " fmla v0.4s, v8.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v1.4s, v9.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v2.4s,v10.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v3.4s,v11.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v4.4s,v12.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v5.4s,v13.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x2 \n\t" - " \n\t" - " st1 {v0.s}[0],[x5],x14 \n\t" // Store c00 into quad and increment by rs_c. - " st1 {v0.s}[1],[x5],x14 \n\t" // Store c01 into quad and increment by rs_c. - " st1 {v0.s}[2],[x5],x14 \n\t" // Store c02 into quad and increment by rs_c. - " st1 {v0.s}[3],[x5],x14 \n\t" // Store c03 into quad and increment by rs_c. - " st1 {v1.s}[0],[x5],x14 \n\t" // Store c04 into quad and increment by rs_c. - " st1 {v1.s}[1],[x5],x14 \n\t" // Store c05 into quad and increment by rs_c. - " st1 {v1.s}[2],[x5],x14 \n\t" // Store c06 into quad and increment by rs_c. 
- " st1 {v1.s}[3],[x5],x14 \n\t" // Store c07 into quad and increment by rs_c. - " \n\t" - " mov x5, x16 \n\t" - " \n\t" - " st1 {v2.s}[0],[x5],x14 \n\t" // Store c10 into quad and increment by rs_c. - " st1 {v2.s}[1],[x5],x14 \n\t" // Store c11 into quad and increment by rs_c. - " st1 {v2.s}[2],[x5],x14 \n\t" // Store c12 into quad and increment by rs_c. - " st1 {v2.s}[3],[x5],x14 \n\t" // Store c13 into quad and increment by rs_c. - " st1 {v3.s}[0],[x5],x14 \n\t" // Store c14 into quad and increment by rs_c. - " st1 {v3.s}[1],[x5],x14 \n\t" // Store c15 into quad and increment by rs_c. - " st1 {v3.s}[2],[x5],x14 \n\t" // Store c16 into quad and increment by rs_c. - " st1 {v3.s}[3],[x5],x14 \n\t" // Store c17 into quad and increment by rs_c. - " \n\t" - " mov x5, x17 \n\t" - " \n\t" - " st1 {v4.s}[0],[x5],x14 \n\t" // Store c20 into quad and increment by rs_c. - " st1 {v4.s}[1],[x5],x14 \n\t" // Store c21 into quad and increment by rs_c. - " st1 {v4.s}[2],[x5],x14 \n\t" // Store c22 into quad and increment by rs_c. - " st1 {v4.s}[3],[x5],x14 \n\t" // Store c23 into quad and increment by rs_c. - " st1 {v5.s}[0],[x5],x14 \n\t" // Store c24 into quad and increment by rs_c. - " st1 {v5.s}[1],[x5],x14 \n\t" // Store c25 into quad and increment by rs_c. - " st1 {v5.s}[2],[x5],x14 \n\t" // Store c26 into quad and increment by rs_c. - " st1 {v5.s}[3],[x5],x14 \n\t" // Store c27 into quad and increment by rs_c. - " \n\t" - " dup v8.4s, wzr \n\t" - " dup v9.4s, wzr \n\t" - " dup v10.4s, wzr \n\t" - " dup v11.4s, wzr \n\t" - " dup v12.4s, wzr \n\t" - " dup v13.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x19 \n\t" - " \n\t" - " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c30 into quad and increment by rs_c. - " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c31 into quad and increment by rs_c. - " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c32 into quad and increment by rs_c. - " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c33 into quad and increment by rs_c. - " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c34 into quad and increment by rs_c. - " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c35 into quad and increment by rs_c. - " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c36 into quad and increment by rs_c. - " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c37 into quad and increment by rs_c. - " \n\t" - " mov x5, x20 \n\t" - " \n\t" - " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c40 into quad and increment by rs_c. - " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c41 into quad and increment by rs_c. - " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c42 into quad and increment by rs_c. - " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c43 into quad and increment by rs_c. - " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c44 into quad and increment by rs_c. - " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c45 into quad and increment by rs_c. - " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c46 into quad and increment by rs_c. - " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c47 into quad and increment by rs_c. - " \n\t" - " mov x5, x21 \n\t" - " \n\t" - " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c50 into quad and increment by rs_c. - " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c51 into quad and increment by rs_c. - " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c52 into quad and increment by rs_c. - " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c53 into quad and increment by rs_c. - " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c54 into quad and increment by rs_c. - " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c55 into quad and increment by rs_c. 
- " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c56 into quad and increment by rs_c. - " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c57 into quad and increment by rs_c. - " \n\t" - " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta - " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta - " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta - " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta - " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta - " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS2) - " \n\t" - " fmla v8.4s, v14.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v9.4s, v15.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v10.4s,v16.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v11.4s,v17.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v12.4s,v18.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v13.4s,v19.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x19 \n\t" - " \n\t" - " st1 {v8.s}[0],[x5],x14 \n\t" // Store c30 into quad and increment by rs_c. - " st1 {v8.s}[1],[x5],x14 \n\t" // Store c31 into quad and increment by rs_c. - " st1 {v8.s}[2],[x5],x14 \n\t" // Store c32 into quad and increment by rs_c. - " st1 {v8.s}[3],[x5],x14 \n\t" // Store c33 into quad and increment by rs_c. - " st1 {v9.s}[0],[x5],x14 \n\t" // Store c34 into quad and increment by rs_c. - " st1 {v9.s}[1],[x5],x14 \n\t" // Store c35 into quad and increment by rs_c. - " st1 {v9.s}[2],[x5],x14 \n\t" // Store c36 into quad and increment by rs_c. - " st1 {v9.s}[3],[x5],x14 \n\t" // Store c37 into quad and increment by rs_c. - " \n\t" - " mov x5, x20 \n\t" - " \n\t" - " st1 {v10.s}[0],[x5],x14 \n\t" // Store c40 into quad and increment by rs_c. - " st1 {v10.s}[1],[x5],x14 \n\t" // Store c41 into quad and increment by rs_c. - " st1 {v10.s}[2],[x5],x14 \n\t" // Store c42 into quad and increment by rs_c. - " st1 {v10.s}[3],[x5],x14 \n\t" // Store c43 into quad and increment by rs_c. - " st1 {v11.s}[0],[x5],x14 \n\t" // Store c44 into quad and increment by rs_c. - " st1 {v11.s}[1],[x5],x14 \n\t" // Store c45 into quad and increment by rs_c. - " st1 {v11.s}[2],[x5],x14 \n\t" // Store c46 into quad and increment by rs_c. - " st1 {v11.s}[3],[x5],x14 \n\t" // Store c47 into quad and increment by rs_c. - " \n\t" - " mov x5, x21 \n\t" - " \n\t" - " st1 {v12.s}[0],[x5],x14 \n\t" // Store c50 into quad and increment by rs_c. - " st1 {v12.s}[1],[x5],x14 \n\t" // Store c51 into quad and increment by rs_c. - " st1 {v12.s}[2],[x5],x14 \n\t" // Store c52 into quad and increment by rs_c. - " st1 {v12.s}[3],[x5],x14 \n\t" // Store c53 into quad and increment by rs_c. - " st1 {v13.s}[0],[x5],x14 \n\t" // Store c54 into quad and increment by rs_c. - " st1 {v13.s}[1],[x5],x14 \n\t" // Store c55 into quad and increment by rs_c. - " st1 {v13.s}[2],[x5],x14 \n\t" // Store c56 into quad and increment by rs_c. - " st1 {v13.s}[3],[x5],x14 \n\t" // Store c57 into quad and increment by rs_c. - " \n\t" - " dup v0.4s, wzr \n\t" - " dup v1.4s, wzr \n\t" - " dup v2.4s, wzr \n\t" - " dup v3.4s, wzr \n\t" - " dup v4.4s, wzr \n\t" - " dup v5.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x22 \n\t" - " \n\t" - " ld1 {v0.s}[0],[x5],x14 \n\t" // Load c60 into quad and increment by rs_c. - " ld1 {v0.s}[1],[x5],x14 \n\t" // Load c61 into quad and increment by rs_c. - " ld1 {v0.s}[2],[x5],x14 \n\t" // Load c62 into quad and increment by rs_c. - " ld1 {v0.s}[3],[x5],x14 \n\t" // Load c63 into quad and increment by rs_c. 
- " ld1 {v1.s}[0],[x5],x14 \n\t" // Load c64 into quad and increment by rs_c. - " ld1 {v1.s}[1],[x5],x14 \n\t" // Load c65 into quad and increment by rs_c. - " ld1 {v1.s}[2],[x5],x14 \n\t" // Load c66 into quad and increment by rs_c. - " ld1 {v1.s}[3],[x5],x14 \n\t" // Load c67 into quad and increment by rs_c. - " \n\t" - " mov x5, x23 \n\t" - " \n\t" - " ld1 {v2.s}[0],[x5],x14 \n\t" // Load c70 into quad and increment by rs_c. - " ld1 {v2.s}[1],[x5],x14 \n\t" // Load c71 into quad and increment by rs_c. - " ld1 {v2.s}[2],[x5],x14 \n\t" // Load c72 into quad and increment by rs_c. - " ld1 {v2.s}[3],[x5],x14 \n\t" // Load c73 into quad and increment by rs_c. - " ld1 {v3.s}[0],[x5],x14 \n\t" // Load c74 into quad and increment by rs_c. - " ld1 {v3.s}[1],[x5],x14 \n\t" // Load c75 into quad and increment by rs_c. - " ld1 {v3.s}[2],[x5],x14 \n\t" // Load c76 into quad and increment by rs_c. - " ld1 {v3.s}[3],[x5],x14 \n\t" // Load c77 into quad and increment by rs_c. - " \n\t" - " mov x5, x24 \n\t" - " \n\t" - " ld1 {v4.s}[0],[x5],x14 \n\t" // Load c80 into quad and increment by rs_c. - " ld1 {v4.s}[1],[x5],x14 \n\t" // Load c81 into quad and increment by rs_c. - " ld1 {v4.s}[2],[x5],x14 \n\t" // Load c82 into quad and increment by rs_c. - " ld1 {v4.s}[3],[x5],x14 \n\t" // Load c83 into quad and increment by rs_c. - " ld1 {v5.s}[0],[x5],x14 \n\t" // Load c84 into quad and increment by rs_c. - " ld1 {v5.s}[1],[x5],x14 \n\t" // Load c85 into quad and increment by rs_c. - " ld1 {v5.s}[2],[x5],x14 \n\t" // Load c86 into quad and increment by rs_c. - " ld1 {v5.s}[3],[x5],x14 \n\t" // Load c87 into quad and increment by rs_c. - " \n\t" - " fmul v0.4s,v0.4s,v7.s[0] \n\t" // Scale by beta - " fmul v1.4s,v1.4s,v7.s[0] \n\t" // Scale by beta - " fmul v2.4s,v2.4s,v7.s[0] \n\t" // Scale by beta - " fmul v3.4s,v3.4s,v7.s[0] \n\t" // Scale by beta - " fmul v4.4s,v4.4s,v7.s[0] \n\t" // Scale by beta - " fmul v5.4s,v5.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS3) - " \n\t" - " fmla v0.4s,v20.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v1.4s,v21.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v2.4s,v22.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v3.4s,v23.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v4.4s,v24.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v5.4s,v25.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x22 \n\t" - " \n\t" - " st1 {v0.s}[0],[x5],x14 \n\t" // Store c60 into quad and increment by rs_c. - " st1 {v0.s}[1],[x5],x14 \n\t" // Store c61 into quad and increment by rs_c. - " st1 {v0.s}[2],[x5],x14 \n\t" // Store c62 into quad and increment by rs_c. - " st1 {v0.s}[3],[x5],x14 \n\t" // Store c63 into quad and increment by rs_c. - " st1 {v1.s}[0],[x5],x14 \n\t" // Store c64 into quad and increment by rs_c. - " st1 {v1.s}[1],[x5],x14 \n\t" // Store c65 into quad and increment by rs_c. - " st1 {v1.s}[2],[x5],x14 \n\t" // Store c66 into quad and increment by rs_c. - " st1 {v1.s}[3],[x5],x14 \n\t" // Store c67 into quad and increment by rs_c. - " \n\t" - " mov x5, x23 \n\t" - " \n\t" - " st1 {v2.s}[0],[x5],x14 \n\t" // Store c70 into quad and increment by rs_c. - " st1 {v2.s}[1],[x5],x14 \n\t" // Store c71 into quad and increment by rs_c. - " st1 {v2.s}[2],[x5],x14 \n\t" // Store c72 into quad and increment by rs_c. - " st1 {v2.s}[3],[x5],x14 \n\t" // Store c73 into quad and increment by rs_c. - " st1 {v3.s}[0],[x5],x14 \n\t" // Store c74 into quad and increment by rs_c. - " st1 {v3.s}[1],[x5],x14 \n\t" // Store c75 into quad and increment by rs_c. 
- " st1 {v3.s}[2],[x5],x14 \n\t" // Store c76 into quad and increment by rs_c. - " st1 {v3.s}[3],[x5],x14 \n\t" // Store c77 into quad and increment by rs_c. - " \n\t" - " mov x5, x24 \n\t" - " \n\t" - " st1 {v4.s}[0],[x5],x14 \n\t" // Store c80 into quad and increment by rs_c. - " st1 {v4.s}[1],[x5],x14 \n\t" // Store c81 into quad and increment by rs_c. - " st1 {v4.s}[2],[x5],x14 \n\t" // Store c82 into quad and increment by rs_c. - " st1 {v4.s}[3],[x5],x14 \n\t" // Store c83 into quad and increment by rs_c. - " st1 {v5.s}[0],[x5],x14 \n\t" // Store c84 into quad and increment by rs_c. - " st1 {v5.s}[1],[x5],x14 \n\t" // Store c85 into quad and increment by rs_c. - " st1 {v5.s}[2],[x5],x14 \n\t" // Store c86 into quad and increment by rs_c. - " st1 {v5.s}[3],[x5],x14 \n\t" // Store c87 into quad and increment by rs_c. - " \n\t" - " dup v8.4s, wzr \n\t" - " dup v9.4s, wzr \n\t" - " dup v10.4s, wzr \n\t" - " dup v11.4s, wzr \n\t" - " dup v12.4s, wzr \n\t" - " dup v13.4s, wzr \n\t" - " \n\t" - " fcmp s7,#0.0 \n\t" - BEQ(SBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. - " \n\t" - " mov x5, x25 \n\t" - " \n\t" - " ld1 {v8.s}[0],[x5],x14 \n\t" // Load c90 into quad and increment by rs_c. - " ld1 {v8.s}[1],[x5],x14 \n\t" // Load c91 into quad and increment by rs_c. - " ld1 {v8.s}[2],[x5],x14 \n\t" // Load c92 into quad and increment by rs_c. - " ld1 {v8.s}[3],[x5],x14 \n\t" // Load c93 into quad and increment by rs_c. - " ld1 {v9.s}[0],[x5],x14 \n\t" // Load c94 into quad and increment by rs_c. - " ld1 {v9.s}[1],[x5],x14 \n\t" // Load c95 into quad and increment by rs_c. - " ld1 {v9.s}[2],[x5],x14 \n\t" // Load c96 into quad and increment by rs_c. - " ld1 {v9.s}[3],[x5],x14 \n\t" // Load c97 into quad and increment by rs_c. - " \n\t" - " mov x5, x26 \n\t" - " \n\t" - " ld1 {v10.s}[0],[x5],x14 \n\t" // Load c100 into quad and increment by rs_c. - " ld1 {v10.s}[1],[x5],x14 \n\t" // Load c101 into quad and increment by rs_c. - " ld1 {v10.s}[2],[x5],x14 \n\t" // Load c102 into quad and increment by rs_c. - " ld1 {v10.s}[3],[x5],x14 \n\t" // Load c103 into quad and increment by rs_c. - " ld1 {v11.s}[0],[x5],x14 \n\t" // Load c104 into quad and increment by rs_c. - " ld1 {v11.s}[1],[x5],x14 \n\t" // Load c105 into quad and increment by rs_c. - " ld1 {v11.s}[2],[x5],x14 \n\t" // Load c106 into quad and increment by rs_c. - " ld1 {v11.s}[3],[x5],x14 \n\t" // Load c107 into quad and increment by rs_c. - " \n\t" - " mov x5, x27 \n\t" - " \n\t" - " ld1 {v12.s}[0],[x5],x14 \n\t" // Load c110 into quad and increment by rs_c. - " ld1 {v12.s}[1],[x5],x14 \n\t" // Load c111 into quad and increment by rs_c. - " ld1 {v12.s}[2],[x5],x14 \n\t" // Load c112 into quad and increment by rs_c. - " ld1 {v12.s}[3],[x5],x14 \n\t" // Load c113 into quad and increment by rs_c. - " ld1 {v13.s}[0],[x5],x14 \n\t" // Load c114 into quad and increment by rs_c. - " ld1 {v13.s}[1],[x5],x14 \n\t" // Load c115 into quad and increment by rs_c. - " ld1 {v13.s}[2],[x5],x14 \n\t" // Load c116 into quad and increment by rs_c. - " ld1 {v13.s}[3],[x5],x14 \n\t" // Load c117 into quad and increment by rs_c. 
- " \n\t" - " fmul v8.4s, v8.4s, v7.s[0] \n\t" // Scale by beta - " fmul v9.4s, v9.4s, v7.s[0] \n\t" // Scale by beta - " fmul v10.4s,v10.4s,v7.s[0] \n\t" // Scale by beta - " fmul v11.4s,v11.4s,v7.s[0] \n\t" // Scale by beta - " fmul v12.4s,v12.4s,v7.s[0] \n\t" // Scale by beta - " fmul v13.4s,v13.4s,v7.s[0] \n\t" // Scale by beta - " \n\t" - LABEL(SBETAZEROGENSTOREDS4) - " \n\t" - " prfm pldl2keep,[x0] \n\t" - " prfm pldl2keep,[x1] \n\t" - " \n\t" - " fmla v8.4s, v26.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v9.4s, v27.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v10.4s,v28.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v11.4s,v29.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v12.4s,v30.4s,v6.s[0] \n\t" // Scale by alpha - " fmla v13.4s,v31.4s,v6.s[0] \n\t" // Scale by alpha - " \n\t" - " mov x5, x25 \n\t" - " \n\t" - " st1 {v8.s}[0],[x5],x14 \n\t" // Store c90 into quad and increment by rs_c. - " st1 {v8.s}[1],[x5],x14 \n\t" // Store c91 into quad and increment by rs_c. - " st1 {v8.s}[2],[x5],x14 \n\t" // Store c92 into quad and increment by rs_c. - " st1 {v8.s}[3],[x5],x14 \n\t" // Store c93 into quad and increment by rs_c. - " st1 {v9.s}[0],[x5],x14 \n\t" // Store c94 into quad and increment by rs_c. - " st1 {v9.s}[1],[x5],x14 \n\t" // Store c95 into quad and increment by rs_c. - " st1 {v9.s}[2],[x5],x14 \n\t" // Store c96 into quad and increment by rs_c. - " st1 {v9.s}[3],[x5],x14 \n\t" // Store c97 into quad and increment by rs_c. - " \n\t" - " mov x5, x26 \n\t" - " \n\t" - " st1 {v10.s}[0],[x5],x14 \n\t" // Store c100 into quad and increment by rs_c. - " st1 {v10.s}[1],[x5],x14 \n\t" // Store c101 into quad and increment by rs_c. - " st1 {v10.s}[2],[x5],x14 \n\t" // Store c102 into quad and increment by rs_c. - " st1 {v10.s}[3],[x5],x14 \n\t" // Store c103 into quad and increment by rs_c. - " st1 {v11.s}[0],[x5],x14 \n\t" // Store c104 into quad and increment by rs_c. - " st1 {v11.s}[1],[x5],x14 \n\t" // Store c105 into quad and increment by rs_c. - " st1 {v11.s}[2],[x5],x14 \n\t" // Store c106 into quad and increment by rs_c. - " st1 {v11.s}[3],[x5],x14 \n\t" // Store c107 into quad and increment by rs_c. - " \n\t" - " mov x5, x27 \n\t" - " \n\t" - " st1 {v12.s}[0],[x5],x14 \n\t" // Store c110 into quad and increment by rs_c. - " st1 {v12.s}[1],[x5],x14 \n\t" // Store c111 into quad and increment by rs_c. - " st1 {v12.s}[2],[x5],x14 \n\t" // Store c112 into quad and increment by rs_c. - " st1 {v12.s}[3],[x5],x14 \n\t" // Store c113 into quad and increment by rs_c. - " st1 {v13.s}[0],[x5],x14 \n\t" // Store c114 into quad and increment by rs_c. - " st1 {v13.s}[1],[x5],x14 \n\t" // Store c115 into quad and increment by rs_c. - " st1 {v13.s}[2],[x5],x14 \n\t" // Store c116 into quad and increment by rs_c. - " st1 {v13.s}[3],[x5],x14 \n\t" // Store c147 into quad and increment by rs_c. - " \n\t" - LABEL(SEND) // Done! + // BRANCH(SEND) // Done. + // LABEL(SEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -1072,7 +693,7 @@ void bli_sgemm_armv8a_asm_8x12 [b_next] "m" (b_next) // 10 :// Register clobber list "x0", "x1", "x2", - "x5", "x6", "x10","x14", + "x5", "x6", "x10", "x16","x17","x19","x20", "x21","x22","x23","x24", "x25","x26","x27", @@ -1148,8 +769,8 @@ void bli_dgemm_armv8a_asm_6x8 " ldr x10,%[cs_c] \n\t" // Load cs_c " lsl x10,x10,#3 \n\t" // cs_c * sizeof(double) " \n\t" - " ldr x14,%[rs_c] \n\t" // Load rs_c. - " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). + // " ldr x14,%[rs_c] \n\t" // Load rs_c. + // " lsl x14,x14,#3 \n\t" // rs_c * sizeof(double). 
" \n\t" " add x20,x2,x10 \n\t" //Load address Column 1 of C " add x21,x20,x10 \n\t" //Load address Column 2 of C @@ -1620,9 +1241,6 @@ void bli_dgemm_armv8a_asm_6x8 " ldr x0,%[a_next] \n\t" // Next A address for later use. " ldr x1,%[b_next] \n\t" // Next B address for later use. " \n\t" - " cmp x14,#8 \n\t" // If rs_c != 1 (column-major) - BNE(DGENSTORED) - " \n\t" LABEL(DCOLSTORED) // C is column-major. " \n\t" " dup v0.2d, xzr \n\t" @@ -1796,262 +1414,8 @@ void bli_dgemm_armv8a_asm_6x8 " str q12, [x26, #16] \n\t" " str q13, [x26, #32] \n\t" " \n\t" - BRANCH(DEND) - " \n\t" - LABEL(DGENSTORED) // C is general-stride stored. - " \n\t" - " dup v0.2d, xzr \n\t" - " dup v1.2d, xzr \n\t" - " dup v2.2d, xzr \n\t" - " dup v3.2d, xzr \n\t" - " dup v4.2d, xzr \n\t" - " dup v5.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS1) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x2 \n\t" - " \n\t" // Load address of C. - " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c00 into quad and increment by rs_c. - " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c01 into quad and increment by rs_c. - " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c02 into quad and increment by rs_c. - " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c03 into quad and increment by rs_c. - " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c04 into quad and increment by rs_c. - " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c05 into quad and increment by rs_c. - " \n\t" - " mov x27, x20 \n\t" // Load address of C. - " \n\t" - " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c10 into quad and increment by rs_c. - " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c11 into quad and increment by rs_c. - " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c12 into quad and increment by rs_c. - " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c13 into quad and increment by rs_c. - " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c14 into quad and increment by rs_c. - " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c15 into quad and increment by rs_c. - " \n\t" - " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta - " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta - " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta - " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta - " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta - " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS1) - " \n\t" - " fmla v0.2d,v8.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v1.2d,v9.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v2.2d,v10.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v3.2d,v11.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v4.2d,v12.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v5.2d,v13.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x2 \n\t" // Load address of C. - " \n\t" - " st1 {v0.d}[0],[x27],x14 \n\t" // Store c00 into quad and increment by rs_c. - " st1 {v0.d}[1],[x27],x14 \n\t" // Store c01 into quad and increment by rs_c. - " st1 {v1.d}[0],[x27],x14 \n\t" // Store c02 into quad and increment by rs_c. - " st1 {v1.d}[1],[x27],x14 \n\t" // Store c03 into quad and increment by rs_c. - " st1 {v2.d}[0],[x27],x14 \n\t" // Store c04 into quad and increment by rs_c. - " st1 {v2.d}[1],[x27],x14 \n\t" // Store c05 into quad and increment by rs_c. - " \n\t" - " mov x27, x20 \n\t" // Load address of C. - " \n\t" - " st1 {v3.d}[0],[x27],x14 \n\t" // Store c10 into quad and increment by rs_c. - " st1 {v3.d}[1],[x27],x14 \n\t" // Store c11 into quad and increment by rs_c. - " st1 {v4.d}[0],[x27],x14 \n\t" // Store c12 into quad and increment by rs_c. 
- " st1 {v4.d}[1],[x27],x14 \n\t" // Store c13 into quad and increment by rs_c. - " st1 {v5.d}[0],[x27],x14 \n\t" // Store c14 into quad and increment by rs_c. - " st1 {v5.d}[1],[x27],x14 \n\t" // Store c15 into quad and increment by rs_c. - " \n\t" - " dup v8.2d, xzr \n\t" - " dup v9.2d, xzr \n\t" - " dup v10.2d, xzr \n\t" - " dup v11.2d, xzr \n\t" - " dup v12.2d, xzr \n\t" - " dup v13.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS2) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x21 \n\t" // Load address of C. - " \n\t" - " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c20 into quad and increment by rs_c. - " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c21 into quad and increment by rs_c. - " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c22 into quad and increment by rs_c. - " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c23 into quad and increment by rs_c. - " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c24 into quad and increment by rs_c. - " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c25 into quad and increment by rs_c. - " \n\t" - " mov x27, x22 \n\t" // Load address of C. - " \n\t" - " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c30 into quad and increment by rs_c. - " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c31 into quad and increment by rs_c. - " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c32 into quad and increment by rs_c. - " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c33 into quad and increment by rs_c. - " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c34 into quad and increment by rs_c. - " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c35 into quad and increment by rs_c. - " \n\t" - " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta - " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta - " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta - " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta - " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta - " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS2) - " \n\t" - " fmla v8.2d, v14.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v9.2d, v15.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v10.2d,v16.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v11.2d,v17.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v12.2d,v18.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v13.2d,v19.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x21 \n\t" // Load address of C. - " \n\t" - " st1 {v8.d}[0], [x27],x14 \n\t" // Store c20 into quad and increment by rs_c. - " st1 {v8.d}[1], [x27],x14 \n\t" // Store c21 into quad and increment by rs_c. - " st1 {v9.d}[0], [x27],x14 \n\t" // Store c22 into quad and increment by rs_c. - " st1 {v9.d}[1], [x27],x14 \n\t" // Store c23 into quad and increment by rs_c. - " st1 {v10.d}[0],[x27],x14 \n\t" // Store c24 into quad and increment by rs_c. - " st1 {v10.d}[1],[x27],x14 \n\t" // Store c25 into quad and increment by rs_c. - " \n\t" - " mov x27, x22 \n\t" // Load address of C. - " \n\t" - " st1 {v11.d}[0],[x27],x14 \n\t" // Store c30 into quad and increment by rs_c. - " st1 {v11.d}[1],[x27],x14 \n\t" // Store c31 into quad and increment by rs_c. - " st1 {v12.d}[0],[x27],x14 \n\t" // Store c32 into quad and increment by rs_c. - " st1 {v12.d}[1],[x27],x14 \n\t" // Store c33 into quad and increment by rs_c. - " st1 {v13.d}[0],[x27],x14 \n\t" // Store c34 into quad and increment by rs_c. - " st1 {v13.d}[1],[x27],x14 \n\t" // Store c35 into quad and increment by rs_c. 
- " \n\t" - " dup v0.2d, xzr \n\t" - " dup v1.2d, xzr \n\t" - " dup v2.2d, xzr \n\t" - " dup v3.2d, xzr \n\t" - " dup v4.2d, xzr \n\t" - " dup v5.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS3) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x23 \n\t" // Load address of C. - " \n\t" - " ld1 {v0.d}[0],[x27],x14 \n\t" // Load c40 into quad and increment by rs_c. - " ld1 {v0.d}[1],[x27],x14 \n\t" // Load c41 into quad and increment by rs_c. - " ld1 {v1.d}[0],[x27],x14 \n\t" // Load c42 into quad and increment by rs_c. - " ld1 {v1.d}[1],[x27],x14 \n\t" // Load c43 into quad and increment by rs_c. - " ld1 {v2.d}[0],[x27],x14 \n\t" // Load c44 into quad and increment by rs_c. - " ld1 {v2.d}[1],[x27],x14 \n\t" // Load c45 into quad and increment by rs_c. - " \n\t" - " mov x27, x24 \n\t" // Load address of C. - " \n\t" - " ld1 {v3.d}[0],[x27],x14 \n\t" // Load c50 into quad and increment by rs_c. - " ld1 {v3.d}[1],[x27],x14 \n\t" // Load c51 into quad and increment by rs_c. - " ld1 {v4.d}[0],[x27],x14 \n\t" // Load c52 into quad and increment by rs_c. - " ld1 {v4.d}[1],[x27],x14 \n\t" // Load c53 into quad and increment by rs_c. - " ld1 {v5.d}[0],[x27],x14 \n\t" // Load c54 into quad and increment by rs_c. - " ld1 {v5.d}[1],[x27],x14 \n\t" // Load c55 into quad and increment by rs_c. - " \n\t" - " fmul v0.2d,v0.2d,v7.d[0] \n\t" // Scale by beta - " fmul v1.2d,v1.2d,v7.d[0] \n\t" // Scale by beta - " fmul v2.2d,v2.2d,v7.d[0] \n\t" // Scale by beta - " fmul v3.2d,v3.2d,v7.d[0] \n\t" // Scale by beta - " fmul v4.2d,v4.2d,v7.d[0] \n\t" // Scale by beta - " fmul v5.2d,v5.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS3) - " \n\t" - " fmla v0.2d,v20.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v1.2d,v21.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v2.2d,v22.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v3.2d,v23.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v4.2d,v24.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v5.2d,v25.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x23 \n\t" // Load address of C. - " \n\t" - " st1 {v0.d}[0],[x27],x14 \n\t" // Store c40 into quad and increment by rs_c. - " st1 {v0.d}[1],[x27],x14 \n\t" // Store c41 into quad and increment by rs_c. - " st1 {v1.d}[0],[x27],x14 \n\t" // Store c42 into quad and increment by rs_c. - " st1 {v1.d}[1],[x27],x14 \n\t" // Store c43 into quad and increment by rs_c. - " st1 {v2.d}[0],[x27],x14 \n\t" // Store c44 into quad and increment by rs_c. - " st1 {v2.d}[1],[x27],x14 \n\t" // Store c45 into quad and increment by rs_c. - " \n\t" - " mov x27, x24 \n\t" // Load address of C. - " \n\t" - " st1 {v3.d}[0],[x27],x14 \n\t" // Store c50 into quad and increment by rs_c. - " st1 {v3.d}[1],[x27],x14 \n\t" // Store c51 into quad and increment by rs_c. - " st1 {v4.d}[0],[x27],x14 \n\t" // Store c52 into quad and increment by rs_c. - " st1 {v4.d}[1],[x27],x14 \n\t" // Store c53 into quad and increment by rs_c. - " st1 {v5.d}[0],[x27],x14 \n\t" // Store c54 into quad and increment by rs_c. - " st1 {v5.d}[1],[x27],x14 \n\t" // Store c55 into quad and increment by rs_c. - " \n\t" - " dup v8.2d, xzr \n\t" - " dup v9.2d, xzr \n\t" - " dup v10.2d, xzr \n\t" - " dup v11.2d, xzr \n\t" - " dup v12.2d, xzr \n\t" - " dup v13.2d, xzr \n\t" - " \n\t" - " fcmp d7,#0.0 \n\t" - BEQ(DBETAZEROGENSTOREDS4) // Taking care of the beta==0 case. - " \n\t" - " mov x27, x25 \n\t" - " \n\t" - " ld1 {v8.d}[0], [x27],x14 \n\t" // Load c60 into quad and increment by rs_c. 
- " ld1 {v8.d}[1], [x27],x14 \n\t" // Load c61 into quad and increment by rs_c. - " ld1 {v9.d}[0], [x27],x14 \n\t" // Load c62 into quad and increment by rs_c. - " ld1 {v9.d}[1], [x27],x14 \n\t" // Load c63 into quad and increment by rs_c. - " ld1 {v10.d}[0],[x27],x14 \n\t" // Load c64 into quad and increment by rs_c. - " ld1 {v10.d}[1],[x27],x14 \n\t" // Load c65 into quad and increment by rs_c. - " \n\t" - " mov x27, x26 \n\t" // Load address of C. - " \n\t" - " ld1 {v11.d}[0],[x27],x14 \n\t" // Load c70 into quad and increment by rs_c. - " ld1 {v11.d}[1],[x27],x14 \n\t" // Load c71 into quad and increment by rs_c. - " ld1 {v12.d}[0],[x27],x14 \n\t" // Load c72 into quad and increment by rs_c. - " ld1 {v12.d}[1],[x27],x14 \n\t" // Load c73 into quad and increment by rs_c. - " ld1 {v13.d}[0],[x27],x14 \n\t" // Load c74 into quad and increment by rs_c. - " ld1 {v13.d}[1],[x27],x14 \n\t" // Load c75 into quad and increment by rs_c. - " \n\t" - " fmul v8.2d, v8.2d, v7.d[0] \n\t" // Scale by beta - " fmul v9.2d, v9.2d, v7.d[0] \n\t" // Scale by beta - " fmul v10.2d,v10.2d,v7.d[0] \n\t" // Scale by beta - " fmul v11.2d,v11.2d,v7.d[0] \n\t" // Scale by beta - " fmul v12.2d,v12.2d,v7.d[0] \n\t" // Scale by beta - " fmul v13.2d,v13.2d,v7.d[0] \n\t" // Scale by beta - " \n\t" - LABEL(DBETAZEROGENSTOREDS4) - " \n\t" - " prfm pldl2keep,[x0] \n\t" - " prfm pldl2keep,[x1] \n\t" - " \n\t" - " fmla v8.2d, v26.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v9.2d, v27.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v10.2d,v28.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v11.2d,v29.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v12.2d,v30.2d,v6.d[0] \n\t" // Scale by alpha - " fmla v13.2d,v31.2d,v6.d[0] \n\t" // Scale by alpha - " \n\t" - " mov x27, x25 \n\t" // Load address of C. - " \n\t" - " st1 {v8.d}[0], [x27],x14 \n\t" // Store c60 into quad and increment by rs_c. - " st1 {v8.d}[1], [x27],x14 \n\t" // Store c61 into quad and increment by rs_c. - " st1 {v9.d}[0], [x27],x14 \n\t" // Store c62 into quad and increment by rs_c. - " st1 {v9.d}[1], [x27],x14 \n\t" // Store c63 into quad and increment by rs_c. - " st1 {v10.d}[0],[x27],x14 \n\t" // Store c64 into quad and increment by rs_c. - " st1 {v10.d}[1],[x27],x14 \n\t" // Store c65 into quad and increment by rs_c. - " \n\t" - " mov x27, x26 \n\t" // Load address of C. - " \n\t" - " st1 {v11.d}[0],[x27],x14 \n\t" // Store c70 into quad and increment by rs_c. - " st1 {v11.d}[1],[x27],x14 \n\t" // Store c71 into quad and increment by rs_c. - " st1 {v12.d}[0],[x27],x14 \n\t" // Store c72 into quad and increment by rs_c. - " st1 {v12.d}[1],[x27],x14 \n\t" // Store c73 into quad and increment by rs_c. - " st1 {v13.d}[0],[x27],x14 \n\t" // Store c74 into quad and increment by rs_c. - " st1 {v13.d}[1],[x27],x14 \n\t" // Store c75 into quad and increment by rs_c. - " \n\t" - LABEL(DEND) // Done! + // BRANCH(DEND) + // LABEL(DEND) // Done! " \n\t" :// output operands (none) :// input operands @@ -2069,8 +1433,9 @@ void bli_dgemm_armv8a_asm_6x8 :// Register clobber list "x0","x1","x2", "x5","x6","x10", - "x14","x16","x17", - "x20","x21","x22","x23","x24","x25","x26","x27", + "x16","x17","x20", + "x21","x22","x23", + "x24","x25","x26","x27", "v0","v1","v2", "v3","v4","v5", "v6","v7","v8", From 0c3ff05829a138b3d950934db53c0f43259728dc Mon Sep 17 00:00:00 2001 From: Ruqing Xu Date: Thu, 3 Feb 2022 16:40:02 +0000 Subject: [PATCH 2/4] Fix SVE Compil. 
--- config/a64fx/bli_family_a64fx.h | 10 ++++++++++ .../armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 2 +- .../armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 2 +- 5 files changed, 14 insertions(+), 4 deletions(-) diff --git a/config/a64fx/bli_family_a64fx.h b/config/a64fx/bli_family_a64fx.h index 5e3f29fd4b..b67ae7c606 100644 --- a/config/a64fx/bli_family_a64fx.h +++ b/config/a64fx/bli_family_a64fx.h @@ -41,6 +41,16 @@ #define BLIS_SIMD_ALIGN_SIZE 256 #define BLIS_SIMD_NUM_REGISTERS 32 +// SVE-specific configs. +#define N_L1_SVE_DEFAULT 64 +#define W_L1_SVE_DEFAULT 4 +#define C_L1_SVE_DEFAULT 256 +#define N_L2_SVE_DEFAULT 2048 +#define W_L2_SVE_DEFAULT 16 +#define C_L2_SVE_DEFAULT 256 +#define N_L3_SVE_DEFAULT 8192 +#define W_L3_SVE_DEFAULT 16 +#define C_L3_SVE_DEFAULT 256 //#endif diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 60a64515fd..0327f6dbcc 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 7136104b5b..e92eba9d6a 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 20841891bf..deb01f9fea 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -139,7 +139,7 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 7e630894f2..e941f5abda 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -140,7 +140,7 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -// " END_CCOL_PRFM: \n\t" +" END_CCOL_PRFM: \n\t" " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" From 2aaf31132f21489183a2c339dbb48efb530a0ff1 Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Sat, 5 Feb 2022 16:56:04 +0900 Subject: [PATCH 3/4] ArmSVE Use Predicate in M-Direction No need to query MR during kernel runtime. 
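Illustration of the idea as a minimal sketch in ACLE intrinsics rather than
the kernels' inline asm (scale_col and its arguments are invented for this
note; the svwhilelt_* calls are the intrinsic counterparts of the whilelo
instructions the kernels now issue). Predicates built from m keep the tail
lanes inactive, so partial tiles in the M direction work without ever asking
for MR at run time:

    #include <arm_sve.h>
    #include <stdint.h>

    /* Scale the first m rows of a column held in two SVE vectors. */
    void scale_col( double* restrict a, uint64_t m, double alpha )
    {
        svbool_t    p0 = svwhilelt_b64_u64( 0, m );        /* Rows 0 .. VL-1.    */
        svbool_t    p1 = svwhilelt_b64_u64( svcntd(), m ); /* Rows VL .. 2*VL-1. */
        svfloat64_t v0 = svld1_f64( p0, a );          /* Inactive lanes read as zero. */
        svfloat64_t v1 = svld1_f64( p1, a + svcntd() );
        v0 = svmul_n_f64_x( p0, v0, alpha );
        v1 = svmul_n_f64_x( p1, v1, alpha );
        svst1_f64( p0, a, v0 );                       /* Inactive lanes left untouched. */
        svst1_f64( p1, a + svcntd(), v1 );
    }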
--- .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 7 ++- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 53 +++++++------------ .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 50 +++++++---------- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 7 ++- 4 files changed, 43 insertions(+), 74 deletions(-) diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index 0327f6dbcc..c24384b02a 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -68,10 +68,10 @@ void bli_cgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; - GEMM_UKR_SETUP_CT( c, mr, 10, false ); + GEMM_UKR_SETUP_CT( c, m, 10, false ); __asm__ volatile ( +" whilelo p0.s, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -97,7 +97,6 @@ void bli_cgemm_armsve_asm_2vx10_unindexed " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, %4, xzr \n\t" // cs_c -" ptrue p0.s \n\t" " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" @@ -307,7 +306,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 -: +: "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index e92eba9d6a..1c2c37208c 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -67,10 +67,14 @@ void bli_dgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 8; - GEMM_UKR_SETUP_CT( d, mr, 10, false ); + GEMM_UKR_SETUP_CT( d, m, 10, false ); __asm__ volatile ( +" mov x0, xzr \n\t" +" ldr x1, %[m] \n\t" +" whilelo p0.d, x0, x1 \n\t" " incd x0 \n\t" +" whilelo p1.d, x0, x1 \n\t" +" \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -96,7 +100,6 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c -" ptrue p0.d \n\t" " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -114,7 +117,7 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " ld1rd z26.d, p0/z, [x1, 48] \n\t" " ld1rd z27.d, p0/z, [x1, 56] \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" " CCOL_PRFM: \n\t" // " cmp x6, #1 \n\t" @@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " K_MKER_LOOP: \n\t" " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " b K_MKER_LOOP \n\t" " \n\t" @@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3 " cmp x8, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rd z21.d, p0/z, [x1, 8] \n\t" " ld1rd z22.d, p0/z, [x1, 16] \n\t" @@ -255,7 +258,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. // " b.ne WRITE_MEM \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) " \n\t" " WRITE_MEM: \n\t" " \n\t" @@ -273,35 +276,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. " b.eq BETA_ZERO_C \n\t" // First half of C is already loaded in this case. -// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) +// GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) -GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // " b END_WRITE_MEM \n\t" // " \n\t" -// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. -// " mov x8, xzr \n\t" -// " incb x8 \n\t" -// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -// " index z30.d, xzr, x6 \n\t" // Skips passed to index is not multiplied by 8. -// " \n\t" -// " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. 
-// " b.eq BETA_ZERO_G \n\t" -// " \n\t" -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// " \n\t" -// " BETA_ZERO_G: \n\t" -// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -// " \n\t" // " END_WRITE_MEM: \n\t" // " b END_EXEC \n\t" // " \n\t" @@ -310,7 +294,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. : -: [a] "m" (a), +: [m] "m" (m), + [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index deb01f9fea..7dad6953f9 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -67,10 +67,14 @@ void bli_sgemm_armsve_asm_2vx10_unindexed uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 4; - GEMM_UKR_SETUP_CT( s, mr, 10, false ); + GEMM_UKR_SETUP_CT( s, m, 10, false ); __asm__ volatile ( +" mov x0, xzr \n\t" +" ldr x1, %[m] \n\t" +" whilelo p0.s, x0, x1 \n\t" " incw x0 \n\t" +" whilelo p1.s, x0, x1 \n\t" +" \n\t" " ldr x0, %[a] \n\t" " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -96,7 +100,6 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " madd x2, x8, x2, xzr \n\t" // cs_a " madd x3, x8, x3, xzr \n\t" // rs_b " madd x7, x8, x7, xzr \n\t" // cs_c -" ptrue p0.s \n\t" " \n\t" " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" @@ -114,7 +117,7 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " ld1rw z26.s, p0/z, [x1, 24] \n\t" " ld1rw z27.s, p0/z, [x1, 28] \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" " CCOL_PRFM: \n\t" // " cmp x6, #1 \n\t" @@ -149,22 +152,22 @@ CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " K_MKER_LOOP: \n\t" " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. " b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. 
-GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " b K_MKER_LOOP \n\t" " \n\t" @@ -176,7 +179,7 @@ GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3 " cmp x8, #0 \n\t" // End of execution. " b.eq WRITE_MEM_PREP \n\t" " \n\t" -GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p0,x0) +GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" " ld1rw z22.s, p0/z, [x1, 8] \n\t" @@ -260,34 +263,16 @@ SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z1 " \n\t" // Here used scratch: Z[20-29]. " fcmp s31, #0.0 \n\t" " b.eq BETA_ZERO_C \n\t" -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p0,x9,x7) +GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" " BETA_ZERO_C: \n\t" -GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p0,x5,x7) -GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) +GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) +GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) // " b END_WRITE_MEM \n\t" // " \n\t" -// " WRITE_MEM_G: \n\t" // Available scratch: Z[20-30]. -// " \n\t" // Here used scratch: Z[20-30] - Z30 as index. -// " mov x8, xzr \n\t" -// " incb x8 \n\t" -// " madd x8, x8, x6, xzr \n\t" // C-column's logical 1-vector skip. -// " index z30.s, wzr, w6 \n\t" // Skips passed to index is not multiplied by 8. -// " \n\t" -// " fcmp s31, #0.0 \n\t" -// " b.eq BETA_ZERO_G \n\t" -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// GEMM_C_LOAD_UKER_G(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z30,p0,p0,x9,x7,x8,x16) -// GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) -// " \n\t" -// " BETA_ZERO_G: \n\t" -// GEMM_C_STORE_UKER_G(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z30,p0,p0,x5,x7,x8,x16) -// GEMM_C_STORE_UKER_G(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,z30,p0,p0,x5,x7,x8,x16) -// " \n\t" // " END_WRITE_MEM: \n\t" // " b END_EXEC \n\t" // " \n\t" @@ -296,7 +281,8 @@ GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p0,x5,x7) " END_EXEC: \n\t" " mov x0, #0 \n\t" // Return normal. 
: -: [a] "m" (a), +: [m] "m" (m), + [a] "m" (a), [b] "m" (b), [c] "m" (c), [rs_c] "m" (rs_c), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index e941f5abda..42b1345ffa 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -68,10 +68,10 @@ void bli_zgemm_armsve_asm_2vx10_unindexed uint64_t cs_c = cs_c0; uint64_t info = 0; - uint64_t mr = bli_vl_bytes_armsve() * 2 / 16; - GEMM_UKR_SETUP_CT( z, mr, 10, false ); + GEMM_UKR_SETUP_CT( z, m, 10, false ); __asm__ volatile ( +" whilelo p0.d, xzr, %12 \n\t" // " ldr x0, %[a] \n\t" // " ldr x1, %[b] \n\t" " mov x2, xzr \n\t" @@ -97,7 +97,6 @@ void bli_zgemm_armsve_asm_2vx10_unindexed " madd x2, x16, x2, xzr \n\t" // cs_a " madd x3, x16, x3, xzr \n\t" // rs_b " madd %4, x16, %4, xzr \n\t" // cs_c -" ptrue p0.d \n\t" " \n\t" // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" @@ -306,7 +305,7 @@ GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) "+r" (a_next), // %9 "+r" (b_next), // %10 "=r" (info) // %11 -: +: "r" (m) // %12 : "x2","x3","x9","x16", "z0","z1","z2","z3","z4","z5","z6","z7", "z8","z9","z10","z11","z12","z13","z14","z15", From 5436689d1a24a1493aff43255523260d7d8db59c Mon Sep 17 00:00:00 2001 From: RuQing Xu Date: Mon, 7 Feb 2022 17:14:49 +0900 Subject: [PATCH 4/4] ArmSVE Adopts Label Wrapper For clang (& armclang?) compilation. Hopefully solves #609 . --- .../armsve/1m/bli_dpackm_armsve512_asm_10xk.c | 37 +++++----- .../armsve/1m/bli_dpackm_armsve512_asm_16xk.c | 37 +++++----- kernels/armsve/3/armsve_asm_macros.h | 13 ++++ .../3/bli_gemm_armsve_asm_c2vx10_unindexed.c | 70 +++++++++---------- .../3/bli_gemm_armsve_asm_d2vx10_unindexed.c | 57 +++++++-------- .../3/bli_gemm_armsve_asm_s2vx10_unindexed.c | 55 ++++++++------- .../3/bli_gemm_armsve_asm_z2vx10_unindexed.c | 70 +++++++++---------- kernels/armv8a/3/armv8a_asm_utils.h | 2 +- 8 files changed, 179 insertions(+), 162 deletions(-) diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c index 44718fa578..a086b3a76e 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c @@ -36,6 +36,7 @@ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" #include "armsve512_asm_transpose_d8x2.h" +#include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. @@ -93,9 +94,9 @@ void bli_dpackm_armsve512_asm_10xk "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" - "b.ne .AROWSTOR \n\t" + BNE(AROWSTOR) // A stored in columns. - " .ACOLSTOR: \n\t" + LABEL(ACOLSTOR) // Prefetch distance. 
"mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" @@ -105,9 +106,9 @@ void bli_dpackm_armsve512_asm_10xk "lsl x16, x16, #60 \n\t" "orr x0, x0, x16 \n\t" #endif - " .ACOLSTORMKER: \n\t" + LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" - "b.eq .ACOLSTORMKEREND \n\t" + BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" @@ -201,11 +202,11 @@ void bli_dpackm_armsve512_asm_10xk // "add x1, x1, #320 \n\t" "add x0, x7, x3 \n\t" "sub x8, x8, #1 \n\t" - "b .ACOLSTORMKER \n\t" - " .ACOLSTORMKEREND: \n\t" - " .ACOLSTORLEFT: \n\t" + BRANCH(ACOLSTORMKER) + LABEL(ACOLSTORMKEREND) + LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ldr q1, [x0, #64] \n\t" "st1d z0.d, p0, [x1] \n\t" @@ -213,14 +214,14 @@ void bli_dpackm_armsve512_asm_10xk "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" - "b .ACOLSTORLEFT \n\t" + BRANCH(ACOLSTORLEFT) // A stored in rows. - " .AROWSTOR: \n\t" + LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) - " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. + LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" - "b.eq .AROWSTORMKEREND \n\t" + BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" @@ -271,15 +272,15 @@ void bli_dpackm_armsve512_asm_10xk "add x1, x16, x2 \n\t" "add x0, x0, #64 \n\t" "sub x8, x8, #1 \n\t" - "b .AROWSTORMKER \n\t" - " .AROWSTORMKEREND: \n\t" + BRANCH(AROWSTORMKER) + LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. - " .AROWSTORLEFT: \n\t" + LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "add x7, x6, x4 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_10xk "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" - "b .AROWSTORLEFT \n\t" - " .UNITKDONE: \n\t" + BRANCH(AROWSTORLEFT) + LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), diff --git a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c index f02b87a7a0..aeb323c0ca 100644 --- a/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c +++ b/kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c @@ -35,6 +35,7 @@ #include "blis.h" #include "armsve512_asm_transpose_d8x8.h" +#include "../3/armsve_asm_macros.h" // assumption: // SVE vector length = 512 bits. @@ -99,9 +100,9 @@ void bli_dpackm_armsve512_asm_16xk "mov x8, %[n_mker] \n\t" "mov x9, %[n_left] \n\t" "ptrue p0.d \n\t" - "b.ne .AROWSTOR \n\t" + BNE(AROWSTOR) // A stored in columns. - " .ACOLSTOR: \n\t" + LABEL(ACOLSTOR) // Prefetch distance. 
"mov x17, #8 \n\t" "madd x17, x17, x3, xzr \n\t" @@ -125,9 +126,9 @@ void bli_dpackm_armsve512_asm_16xk // "prfm PLDL1STRM, [x5] \n\t" // "prfm PLDL1STRM, [x6] \n\t" // "prfm PLDL1STRM, [x7] \n\t" - " .ACOLSTORMKER: \n\t" + LABEL(ACOLSTORMKER) "cmp x8, xzr \n\t" - "b.eq .ACOLSTORMKEREND \n\t" + BEQ(ACOLSTORMKEREND) "add x5, x0, x3 \n\t" "add x6, x5, x3 \n\t" "add x7, x6, x3 \n\t" @@ -193,11 +194,11 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x7, x3 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" - "b .ACOLSTORMKER \n\t" - " .ACOLSTORMKEREND: \n\t" - " .ACOLSTORLEFT: \n\t" + BRANCH(ACOLSTORMKER) + LABEL(ACOLSTORMKEREND) + LABEL(ACOLSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "ld1d z0.d, p0/z, [x0] \n\t" "ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t" "st1d z0.d, p0, [x1] \n\t" @@ -205,14 +206,14 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x0, x3 \n\t" "add x1, x1, x2 \n\t" "sub x9, x9, #1 \n\t" - "b .ACOLSTORLEFT \n\t" + BRANCH(ACOLSTORLEFT) // A stored in rows. - " .AROWSTOR: \n\t" + LABEL(AROWSTOR) // Prepare predicates for in-reg transpose. SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6) - " .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful. + LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful. "cmp x8, xzr \n\t" - "b.eq .AROWSTORMKEREND \n\t" + BEQ(AROWSTORMKEREND) "add x10, x0, x4 \n\t" "add x11, x10, x4 \n\t" "add x12, x11, x4 \n\t" @@ -274,15 +275,15 @@ void bli_dpackm_armsve512_asm_16xk "add x0, x0, #64 \n\t" "add x1, x16, x2 \n\t" "sub x8, x8, #1 \n\t" - "b .AROWSTORMKER \n\t" - " .AROWSTORMKEREND: \n\t" + BRANCH(AROWSTORMKER) + LABEL(AROWSTORMKEREND) "mov x4, %[inca] \n\t" // Restore unshifted inca. "index z30.d, xzr, x4 \n\t" // Generate index. "lsl x4, x4, #3 \n\t" // Shift again. "lsl x5, x4, #3 \n\t" // Virtual column vl. - " .AROWSTORLEFT: \n\t" + LABEL(AROWSTORLEFT) "cmp x9, xzr \n\t" - "b.eq .UNITKDONE \n\t" + BEQ(UNITKDONE) "add x6, x0, x5 \n\t" "ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t" "ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t" @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_16xk "add x1, x1, x2 \n\t" "add x0, x0, #8 \n\t" "sub x9, x9, #1 \n\t" - "b .AROWSTORLEFT \n\t" - " .UNITKDONE: \n\t" + BRANCH(AROWSTORLEFT) + LABEL(UNITKDONE) "mov x0, #0 \n\t" : : [a] "r" (a), diff --git a/kernels/armsve/3/armsve_asm_macros.h b/kernels/armsve/3/armsve_asm_macros.h index 5e8eb3c623..9cbbeab920 100644 --- a/kernels/armsve/3/armsve_asm_macros.h +++ b/kernels/armsve/3/armsve_asm_macros.h @@ -33,6 +33,19 @@ */ +// Clang's label requirements. +#if defined(__clang__) +#define LABEL(str) " L" #str"%=: \n\t" +#define BEQ(str) "b.eq L" #str"%= \n\t" +#define BNE(str) "b.ne L" #str"%= \n\t" +#define BRANCH(str) "b L" #str"%= \n\t" +#else +#define LABEL(str) " ." #str": \n\t" +#define BEQ(str) "b.eq ." #str" \n\t" +#define BNE(str) "b.ne ." #str" \n\t" +#define BRANCH(str) "b ." #str" \n\t" +#endif + #define CLEAR_COL2(Z0,Z1) \ " dup "#Z0"."DT", #0 \n\t" \ " dup "#Z1"."DT", #0 \n\t" diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c index c24384b02a..098d5d4b5e 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_c2vx10_unindexed.c @@ -101,9 +101,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. 
-" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " \n\t" " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rw z21.s, p0/z, [%1, 4*2] \n\t" @@ -116,9 +116,9 @@ void bli_cgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) @@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rw z20.s, p0/z, [%1, 4*0] \n\t" // Load B's real 8/10, no imaginary. @@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rw z27.s, p0/z, [%1, 4*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). // " ldr x8, %[beta] \n\t" @@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " ld1rw z30.s, p0/z, [%8] \n\t" // Real(beta). " ld1rw z31.s, p0/z, [%8, 4] \n\t" // Imag(beta). 
" \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX @@ -209,90 +209,90 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " fmov s27, #1.0 \n\t" " fcmp s29, #0.0 \n\t" // Whether Imag(alpha) == 0. " fccmp s28, s27, 0, eq \n\t" // Whether Real(alpha) == 1. -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) -" b WRITE_MEM_EXEC \n\t" +BRANCH(WRITE_MEM_EXEC) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" -" WRITE_MEM_EXEC: \n\t" +LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. // " cmp %3, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" +LABEL(WRITE_MEM_C) " fmov s29, wzr \n\t" " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. -" b.eq ZERO_BETA_C_0_1_2_3 \n\t" +BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_C_0_1_2_3: \n\t" +LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" -" b.eq ZERO_BETA_C_4_5_6_7_8_9 \n\t" +BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_C_4_5_6_7_8_9: \n\t" +LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " WRITE_MEM_G: \n\t" +// LABEL(WRITE_MEM_G) // " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, // " mov x3, %3 \n\t" // s.t. 2*sizeof(float) = 2*4 = 8. // " index z28.s, wzr, w3 \n\t" // " fmov s29, wzr \n\t" // " fcmp s31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp s30, s29, 0, eq \n\t" // Whether Real(beta) == 0. 
-// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -// " ZERO_BETA_G_0_1_2_3: \n\t" +// LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" -// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) " \n\t" -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. : "+r" (a), // %0 "+r" (b), // %1 diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c index 1c2c37208c..0ee470f240 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_d2vx10_unindexed.c @@ -104,9 +104,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rd z21.d, p0/z, [x1, 8] \n\t" @@ -119,9 +119,9 @@ void bli_dgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) @@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. 
" \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rd z20.d, p0/z, [x1] \n\t" // Load 8/10 of first B row. @@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" @@ -216,7 +216,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " fmov d28, #1.0 \n\t" // Prepare FP 1.0. " fmov x16, d28 \n\t" " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" #ifdef _A64FX @@ -257,41 +257,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" // Preload first half of C for contiguous case. -// " b.ne WRITE_MEM \n\t" +// BNE(WRITE_MEM) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " \n\t" " cmp x16, x4 \n\t" -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) // " cmp x6, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +LABEL(WRITE_MEM_C) +" \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " fcmp d31, #0.0 \n\t" // Skip loading if *beta == 0 to override NaN. -" b.eq BETA_ZERO_C \n\t" +BEQ(BETA_ZERO_C) // First half of C is already loaded in this case. // GEMM_C_FMAD_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,z31,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" -" BETA_ZERO_C: \n\t" +LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -// " END_ERROR: \n\t" +// LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. 
: : [m] "m" (m), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c index 7dad6953f9..d03af59230 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_s2vx10_unindexed.c @@ -104,9 +104,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " ldr x4, %[k_mker] \n\t" // Number of loops. " ldr x8, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp x4, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. " ld1rw z21.s, p0/z, [x1, 4] \n\t" @@ -119,9 +119,9 @@ void bli_sgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp x6, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, x5 \n\t" " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" @@ -142,14 +142,14 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) " prfm PLDL1STRM, [x16] \n\t" " add x16, x16, x7 \n\t" " prfm PLDL1STRM, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp x4, #0 \n\t" // If no 4-microkernel can be applied -" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) @@ -164,20 +164,20 @@ GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_3(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " \n\t" " subs x4, x4, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" " add x0, x0, x2 \n\t" // Forward A's address to the next column. GEMM_ACOL_CONTIGUOUS_LOAD(z28,z29,p0,p1,x0) GEMM_2VX10_MKER_LOOP_PLAIN_C_4(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10_MKER_LOOP_PLAIN_C_4_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,x1,x3) " add x0, x0, x2 \n\t" // Forward A to fill the blank. " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp x8, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOL_CONTIGUOUS_LOAD(z30,z31,p0,p1,x0) " ld1rw z20.s, p0/z, [x1] \n\t" // Load 8/10 of first B row. @@ -203,9 +203,9 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " add x0, x0, x2 \n\t" // Forward A. " add x1, x1, x3 \n\t" // Forward B. " sub x8, x8, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" " ldr x4, %[alpha] \n\t" // Load alpha & beta (address). " ldr x8, %[beta] \n\t" @@ -214,7 +214,7 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " dup z30.s, w4 \n\t" // Broadcast alpha & beta into vectors. 
" dup z31.s, w8 \n\t" " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) " ldr x0, %[a_next] \n\t" " ldr x1, %[b_next] \n\t" " prfm PLDL2KEEP, [x0] \n\t" @@ -244,41 +244,42 @@ GEMM_FMLA2(z18,z19,p0,z30,z31,z29) " prfm PLDL2KEEP, [x1, 256*8] \n\t" " prfm PLDL2KEEP, [x1, 256*9] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " \n\t" " fmov s28, #1.0 \n\t" " fmov w16, s28 \n\t" " cmp w16, w4 \n\t" -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" SCALE_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19,z30) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) " mov x9, x5 \n\t" // C address for loading. " \n\t" // C address for storing is x5 itself. // " cmp x6, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" // Available scratch: Z[20-30]. +LABEL(WRITE_MEM_C) +" \n\t" // Available scratch: Z[20-30]. " \n\t" // Here used scratch: Z[20-29]. " fcmp s31, #0.0 \n\t" -" b.eq BETA_ZERO_C \n\t" +BEQ(BETA_ZERO_C) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) GEMM_C_LOAD_UKER_C(z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,p0,p1,x9,x7) GEMM_C_FMLA_UKER(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,z20,z22,z24,z26,z28,z21,z23,z25,z27,z29,z31) " \n\t" -" BETA_ZERO_C: \n\t" +LABEL(BETA_ZERO_C) GEMM_C_STORE_UKER_C(z0,z2,z4,z6,z8,z1,z3,z5,z7,z9,p0,p1,x5,x7) GEMM_C_STORE_UKER_C(z10,z12,z14,z16,z18,z11,z13,z15,z17,z19,p0,p1,x5,x7) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -// " END_ERROR: \n\t" +// LABEL(END_ERROR) // " mov x0, #1 \n\t" // Return error. -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov x0, #0 \n\t" // Return normal. : : [m] "m" (m), diff --git a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c index 42b1345ffa..8636a527ba 100644 --- a/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c +++ b/kernels/armsve/3/bli_gemm_armsve_asm_z2vx10_unindexed.c @@ -101,9 +101,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed // " ldr x5, %[k_mker] \n\t" // Number of loops. // " ldr x6, %[k_left] \n\t" " \n\t" -" LOAD_ABC: \n\t" +LABEL(LOAD_ABC) " cmp %5, #0 \n\t" // Don't preload if no microkernel there. -" b.eq END_CCOL_PRFM \n\t" +BEQ(END_CCOL_PRFM) " \n\t" " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. " ld1rd z21.d, p0/z, [%1, 8*2] \n\t" @@ -116,9 +116,9 @@ void bli_zgemm_armsve_asm_2vx10_unindexed " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " \n\t" -" CCOL_PRFM: \n\t" +LABEL(CCOL_PRFM) // " cmp %3, #1 \n\t" -// " b.ne END_CCOL_PRFM \n\t" // Do not prefetch for generic C storage. +// BNE(END_CCOL_PRFM) // Do not prefetch for generic C storage. " mov x16, %2 \n\t" " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" @@ -139,14 +139,14 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " prfm PLDL1KEEP, [x16] \n\t" " add x16, x16, %4 \n\t" " prfm PLDL1KEEP, [x16] \n\t" -" END_CCOL_PRFM: \n\t" +LABEL(END_CCOL_PRFM) " \n\t" CLEAR_COL20(z0,z1,z2,z3,z4,z5,z6,z7,z8,z9,z10,z11,z12,z13,z14,z15,z16,z17,z18,z19) " \n\t" " cmp %5, #0 \n\t" // If no 4-microkernel can be applied. 
-" b.eq K_LEFT_LOOP \n\t" +BEQ(K_LEFT_LOOP) " \n\t" -" K_MKER_LOOP: \n\t" +LABEL(K_MKER_LOOP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) @@ -158,18 +158,18 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z30,z31,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" " subs %5, %5, #1 \n\t" // Decrease counter before final replica. -" b.eq FIN_MKER_LOOP \n\t" // Branch early to avoid reading excess mem. +BEQ(FIN_MKER_LOOP) // Branch early to avoid reading excess mem. " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) -" b K_MKER_LOOP \n\t" +BRANCH(K_MKER_LOOP) " \n\t" -" FIN_MKER_LOOP: \n\t" +LABEL(FIN_MKER_LOOP) GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_2_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z30,z31,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " \n\t" -" K_LEFT_LOOP: \n\t" +LABEL(K_LEFT_LOOP) " cmp %6, #0 \n\t" // End of execution. -" b.eq WRITE_MEM_PREP \n\t" +BEQ(WRITE_MEM_PREP) " \n\t" GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rd z20.d, p0/z, [%1, 8*0] \n\t" // Load B's real 8/10, no imaginary. @@ -182,9 +182,9 @@ GEMM_ACOLCMPLX_CONTIGUOUS_LOAD_FWD(z28,z29,p0,%0,x2) " ld1rd z27.d, p0/z, [%1, 8*14] \n\t" GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18,z1,z3,z5,z7,z9,z11,z13,z15,z17,z19,p0,z28,z29,z20,z21,z22,z23,z24,z25,z26,z27,%1,x3) " sub %6, %6, #1 \n\t" -" b K_LEFT_LOOP \n\t" // Next column / row. +BRANCH(K_LEFT_LOOP) " \n\t" -" WRITE_MEM_PREP: \n\t" +LABEL(WRITE_MEM_PREP) " \n\t" // " ldr x7, %[alpha] \n\t" // Load alpha & beta (address). // " ldr x8, %[beta] \n\t" @@ -193,7 +193,7 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " ld1rd z30.d, p0/z, [%8] \n\t" // Real(beta). " ld1rd z31.d, p0/z, [%8, 8] \n\t" // Imag(beta). " \n\t" -" PREFETCH_ABNEXT: \n\t" +LABEL(PREFETCH_ABNEXT) // " ldr x9, %[a_next] \n\t" // " ldr x10, %[b_next] \n\t" #ifdef _A64FX @@ -209,89 +209,89 @@ GEMM_2VX10CMPLX_MKER_LOOP_PLAIN_C_1_RESIDUAL(z0,z2,z4,z6,z8,z10,z12,z14,z16,z18, " prfm PLDL1STRM, [%10] \n\t" " prfm PLDL1STRM, [%10, 256*1] \n\t" " \n\t" -" WRITE_MEM: \n\t" +LABEL(WRITE_MEM) " fmov d27, #1.0 \n\t" " fcmp d29, #0.0 \n\t" // Whether Imag(alpha) == 0. " fccmp d28, d27, 0, eq \n\t" // Whether Real(alpha) == 1. -" b.eq UNIT_ALPHA \n\t" +BEQ(UNIT_ALPHA) " \n\t" GEMM_FMULCMPLX_COL2(z20,z21,z22,z23,p0,z0 ,z1 ,z2 ,z3 ,z28,z29) GEMM_FMULCMPLX_COL2(z24,z25,z26,z27,p0,z4 ,z5 ,z6 ,z7 ,z28,z29) GEMM_FMULCMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z8, z9, z10,z11,z28,z29) GEMM_FMULCMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z12,z13,z14,z15,z28,z29) GEMM_FMULCMPLX_COL2(z8 ,z9 ,z10,z11,p0,z16,z17,z18,z19,z28,z29) -" b WRITE_MEM_EXEC \n\t" +BRANCH(WRITE_MEM_EXEC) " \n\t" -" UNIT_ALPHA: \n\t" +LABEL(UNIT_ALPHA) MOV_COL2(z20,z21,z22,z23,z0 ,z1 ,z2 ,z3 ) MOV_COL2(z24,z25,z26,z27,z4 ,z5 ,z6 ,z7 ) MOV_COL2(z0 ,z1 ,z2 ,z3 ,z8, z9, z10,z11) MOV_COL2(z4 ,z5 ,z6 ,z7 ,z12,z13,z14,z15) MOV_COL2(z8 ,z9 ,z10,z11,z16,z17,z18,z19) " \n\t" -" WRITE_MEM_EXEC: \n\t" +LABEL(WRITE_MEM_EXEC) " mov x9, %2 \n\t" // C address for loading. " \n\t" // C address for storing is %2 itself. 
// " cmp %3, #1 \n\t" -// " b.ne WRITE_MEM_G \n\t" +// BNE(WRITE_MEM_G) " \n\t" -" WRITE_MEM_C: \n\t" +LABEL(WRITE_MEM_C) " fmov d29, xzr \n\t" " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. -" b.eq ZERO_BETA_C_0_1_2_3 \n\t" +BEQ(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -" ZERO_BETA_C_0_1_2_3: \n\t" +LABEL(ZERO_BETA_C_0_1_2_3) GEMM_CCMPLX_STORE_COL2_C(z20,z21,z22,z23,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z24,z25,z26,z27,p0,%2,%4) " \n\t" -" b.eq ZERO_BETA_C_4_5_6_7_8_9 \n\t" +BEQ(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_LOAD_COL2_C(z12,z13,z14,z15,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z16,z17,z18,z19,p0,x9,%4) GEMM_CCMPLX_LOAD_COL2_C(z20,z21,z22,z23,p0,x9,%4) GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -" ZERO_BETA_C_4_5_6_7_8_9: \n\t" +LABEL(ZERO_BETA_C_4_5_6_7_8_9) GEMM_CCMPLX_STORE_COL2_C(z0 ,z1 ,z2 ,z3 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z4 ,z5 ,z6 ,z7 ,p0,%2,%4) GEMM_CCMPLX_STORE_COL2_C(z8 ,z9 ,z10,z11,p0,%2,%4) -// " b END_WRITE_MEM \n\t" +// BRANCH(END_WRITE_MEM) // " \n\t" -// " WRITE_MEM_G: \n\t" +// LABEL(WRITE_MEM_G) // " add %3, %3, %3 \n\t" // Skips passed to index is multiplied by 2, // " index z28.d, xzr, %3 \n\t" // s.t. 2*sizeof(double) = 2*8 = 16. // " fmov d29, xzr \n\t" // " fcmp d31, #0.0 \n\t" // Whether Imag(beta) == 0. // " fccmp d30, d29, 0, eq \n\t" // Whether Real(beta) == 0. -// " b.eq ZERO_BETA_G_0_1_2_3 \n\t" +// BEQ(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z20,z21,z22,z23,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z24,z25,z26,z27,p0,z16,z17,z18,z19,z30,z31) -// " ZERO_BETA_G_0_1_2_3: \n\t" +// LABEL(ZERO_BETA_G_0_1_2_3) // GEMM_CCMPLX_STORE_COL2_G(z20,z21,z22,z23,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z24,z25,z26,z27,p0,z28,%2,%4,x16) // " \n\t" -// " b.eq ZERO_BETA_G_4_5_6_7_8_9 \n\t" +// BEQ(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_LOAD_COL2_G(z12,z13,z14,z15,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z16,z17,z18,z19,p0,z28,x9,%4,x16) // GEMM_CCMPLX_LOAD_COL2_G(z20,z21,z22,z23,p0,z28,x9,%4,x16) // GEMM_FMLACMPLX_COL2(z0 ,z1 ,z2 ,z3 ,p0,z12,z13,z14,z15,z30,z31) // GEMM_FMLACMPLX_COL2(z4 ,z5 ,z6 ,z7 ,p0,z16,z17,z18,z19,z30,z31) // GEMM_FMLACMPLX_COL2(z8 ,z9 ,z10,z11,p0,z20,z21,z22,z23,z30,z31) -// " ZERO_BETA_G_4_5_6_7_8_9: \n\t" +// LABEL(ZERO_BETA_G_4_5_6_7_8_9) // GEMM_CCMPLX_STORE_COL2_G(z0 ,z1 ,z2 ,z3 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z4 ,z5 ,z6 ,z7 ,p0,z28,%2,%4,x16) // GEMM_CCMPLX_STORE_COL2_G(z8 ,z9 ,z10,z11,p0,z28,%2,%4,x16) // " \n\t" -// " END_WRITE_MEM: \n\t" -// " b END_EXEC \n\t" +// LABEL(END_WRITE_MEM) +// BRANCH(END_EXEC) // " \n\t" -" END_EXEC: \n\t" +LABEL(END_EXEC) " mov %11, #0 \n\t" // Return normal. : "+r" (a), // %0 "+r" (b), // %1 diff --git a/kernels/armv8a/3/armv8a_asm_utils.h b/kernels/armv8a/3/armv8a_asm_utils.h index 4659509994..0c405dfd26 100644 --- a/kernels/armv8a/3/armv8a_asm_utils.h +++ b/kernels/armv8a/3/armv8a_asm_utils.h @@ -34,7 +34,7 @@ */ -// Apple's local label requirements. +// Apple/Clang's local label requirements. 
#if defined(__APPLE__) || defined(__clang__) #define LABEL(str) " L" #str"%=: \n\t" #define BEQ(str) "b.eq L" #str"%= \n\t"
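A closing note on the wrapper mechanics -- a minimal standalone sketch;
count_down is invented for this illustration and is not part of the patch.
Under the clang-style branch each label expands to "L<name>%=", and %= is
rewritten by the compiler into a number unique to every inline-asm instance,
so a kernel body can be inlined or instantiated twice without the
duplicate-label errors that bare ".<name>:" labels can trigger:

    #include <stdint.h>

    #define LABEL(str)  " L" #str "%=: \n\t"
    #define BEQ(str)    "b.eq L" #str "%= \n\t"
    #define BRANCH(str) "b L" #str "%= \n\t"

    /* Trivial AArch64 countdown loop built on the same wrappers. */
    static inline uint64_t count_down( uint64_t n )
    {
        __asm__ volatile
        (
        LABEL(LOOP)
        " cmp %0, #0 \n\t"
        BEQ(DONE)
        " sub %0, %0, #1 \n\t"
        BRANCH(LOOP)
        LABEL(DONE)
        : "+r" (n)
        :
        : "cc"
        );
        return n;
    }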