Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions config/a64fx/bli_family_a64fx.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,16 @@
#define BLIS_SIMD_ALIGN_SIZE 256
#define BLIS_SIMD_NUM_REGISTERS 32

// SVE-specific configs.
// Default per-cache-level parameters: one N/W/C triple each for L1, L2 and L3.
// NOTE(review): N_*, W_* and C_* presumably denote number of sets,
// associativity (ways) and cache-line size in bytes -- confirm against the
// code that consumes these macros.
// Under that reading, N*W*C gives 64 KiB (L1), 8 MiB (L2) and 32 MiB (L3).
#define N_L1_SVE_DEFAULT 64
#define W_L1_SVE_DEFAULT 4
#define C_L1_SVE_DEFAULT 256
#define N_L2_SVE_DEFAULT 2048
#define W_L2_SVE_DEFAULT 16
#define C_L2_SVE_DEFAULT 256
#define N_L3_SVE_DEFAULT 8192
#define W_L3_SVE_DEFAULT 16
#define C_L3_SVE_DEFAULT 256

//#endif

37 changes: 19 additions & 18 deletions kernels/armsve/1m/bli_dpackm_armsve512_asm_10xk.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
#include "armsve512_asm_transpose_d8x2.h"
#include "../3/armsve_asm_macros.h"

// assumption:
// SVE vector length = 512 bits.
Expand Down Expand Up @@ -93,9 +94,9 @@ void bli_dpackm_armsve512_asm_10xk
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
"b.ne .AROWSTOR \n\t"
BNE(AROWSTOR)
// A stored in columns.
" .ACOLSTOR: \n\t"
LABEL(ACOLSTOR)
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
Expand All @@ -105,9 +106,9 @@ void bli_dpackm_armsve512_asm_10xk
"lsl x16, x16, #60 \n\t"
"orr x0, x0, x16 \n\t"
#endif
" .ACOLSTORMKER: \n\t"
LABEL(ACOLSTORMKER)
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
BEQ(ACOLSTORMKEREND)
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
Expand Down Expand Up @@ -201,26 +202,26 @@ void bli_dpackm_armsve512_asm_10xk
// "add x1, x1, #320 \n\t"
"add x0, x7, x3 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
" .ACOLSTORLEFT: \n\t"
BRANCH(ACOLSTORMKER)
LABEL(ACOLSTORMKEREND)
LABEL(ACOLSTORLEFT)
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
BEQ(UNITKDONE)
"ld1d z0.d, p0/z, [x0] \n\t"
"ldr q1, [x0, #64] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"str q1, [x1, #64] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
BRANCH(ACOLSTORLEFT)
// A stored in rows.
" .AROWSTOR: \n\t"
LABEL(AROWSTOR)
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
BEQ(AROWSTORMKEREND)
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
Expand Down Expand Up @@ -271,15 +272,15 @@ void bli_dpackm_armsve512_asm_10xk
"add x1, x16, x2 \n\t"
"add x0, x0, #64 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
BRANCH(AROWSTORMKER)
LABEL(AROWSTORMKEREND)
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
LABEL(AROWSTORLEFT)
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
BEQ(UNITKDONE)
"add x6, x0, x5 \n\t"
"add x7, x6, x4 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
Expand All @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_10xk
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
BRANCH(AROWSTORLEFT)
LABEL(UNITKDONE)
"mov x0, #0 \n\t"
:
: [a] "r" (a),
Expand Down
37 changes: 19 additions & 18 deletions kernels/armsve/1m/bli_dpackm_armsve512_asm_16xk.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include "blis.h"
#include "armsve512_asm_transpose_d8x8.h"
#include "../3/armsve_asm_macros.h"

// assumption:
// SVE vector length = 512 bits.
Expand Down Expand Up @@ -99,9 +100,9 @@ void bli_dpackm_armsve512_asm_16xk
"mov x8, %[n_mker] \n\t"
"mov x9, %[n_left] \n\t"
"ptrue p0.d \n\t"
"b.ne .AROWSTOR \n\t"
BNE(AROWSTOR)
// A stored in columns.
" .ACOLSTOR: \n\t"
LABEL(ACOLSTOR)
// Prefetch distance.
"mov x17, #8 \n\t"
"madd x17, x17, x3, xzr \n\t"
Expand All @@ -125,9 +126,9 @@ void bli_dpackm_armsve512_asm_16xk
// "prfm PLDL1STRM, [x5] \n\t"
// "prfm PLDL1STRM, [x6] \n\t"
// "prfm PLDL1STRM, [x7] \n\t"
" .ACOLSTORMKER: \n\t"
LABEL(ACOLSTORMKER)
"cmp x8, xzr \n\t"
"b.eq .ACOLSTORMKEREND \n\t"
BEQ(ACOLSTORMKEREND)
"add x5, x0, x3 \n\t"
"add x6, x5, x3 \n\t"
"add x7, x6, x3 \n\t"
Expand Down Expand Up @@ -193,26 +194,26 @@ void bli_dpackm_armsve512_asm_16xk
"add x0, x7, x3 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .ACOLSTORMKER \n\t"
" .ACOLSTORMKEREND: \n\t"
" .ACOLSTORLEFT: \n\t"
BRANCH(ACOLSTORMKER)
LABEL(ACOLSTORMKEREND)
LABEL(ACOLSTORLEFT)
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
BEQ(UNITKDONE)
"ld1d z0.d, p0/z, [x0] \n\t"
"ld1d z1.d, p0/z, [x0, #1, mul vl] \n\t"
"st1d z0.d, p0, [x1] \n\t"
"st1d z1.d, p0, [x1, #1, mul vl] \n\t"
"add x0, x0, x3 \n\t"
"add x1, x1, x2 \n\t"
"sub x9, x9, #1 \n\t"
"b .ACOLSTORLEFT \n\t"
BRANCH(ACOLSTORLEFT)
// A stored in rows.
" .AROWSTOR: \n\t"
LABEL(AROWSTOR)
// Prepare predicates for in-reg transpose.
SVE512_IN_REG_TRANSPOSE_d8x8_PREPARE(x16,p0,p1,p2,p3,p8,p4,p6)
" .AROWSTORMKER: \n\t" // X[10-16] for A here not P. Be careful.
LABEL(AROWSTORMKER) // X[10-16] for A here not P. Be careful.
"cmp x8, xzr \n\t"
"b.eq .AROWSTORMKEREND \n\t"
BEQ(AROWSTORMKEREND)
"add x10, x0, x4 \n\t"
"add x11, x10, x4 \n\t"
"add x12, x11, x4 \n\t"
Expand Down Expand Up @@ -274,15 +275,15 @@ void bli_dpackm_armsve512_asm_16xk
"add x0, x0, #64 \n\t"
"add x1, x16, x2 \n\t"
"sub x8, x8, #1 \n\t"
"b .AROWSTORMKER \n\t"
" .AROWSTORMKEREND: \n\t"
BRANCH(AROWSTORMKER)
LABEL(AROWSTORMKEREND)
"mov x4, %[inca] \n\t" // Restore unshifted inca.
"index z30.d, xzr, x4 \n\t" // Generate index.
"lsl x4, x4, #3 \n\t" // Shift again.
"lsl x5, x4, #3 \n\t" // Virtual column vl.
" .AROWSTORLEFT: \n\t"
LABEL(AROWSTORLEFT)
"cmp x9, xzr \n\t"
"b.eq .UNITKDONE \n\t"
BEQ(UNITKDONE)
"add x6, x0, x5 \n\t"
"ld1d z0.d, p0/z, [x0, z30.d, lsl #3] \n\t"
"ld1d z1.d, p0/z, [x6, z30.d, lsl #3] \n\t"
Expand All @@ -291,8 +292,8 @@ void bli_dpackm_armsve512_asm_16xk
"add x1, x1, x2 \n\t"
"add x0, x0, #8 \n\t"
"sub x9, x9, #1 \n\t"
"b .AROWSTORLEFT \n\t"
" .UNITKDONE: \n\t"
BRANCH(AROWSTORLEFT)
LABEL(UNITKDONE)
"mov x0, #0 \n\t"
:
: [a] "r" (a),
Expand Down
13 changes: 13 additions & 0 deletions kernels/armsve/3/armsve_asm_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@


*/
// Inline-assembly label helpers.
//
// Each macro expands to one line of assembly text (a string literal ending in
// "\n\t") so that kernels can define and branch to labels without hard-coding
// the spelling of the label itself:
//   LABEL(name)  - defines the label.
//   BEQ(name)    - "b.eq" to the label.
//   BNE(name)    - "b.ne" to the label.
//   BRANCH(name) - unconditional "b" to the label.
//
// Only the decoration around the name depends on the compiler:
//   Clang : "L" prefix plus a "%=" suffix in the asm template.
//   others: "." prefix, no suffix.
#if defined(__clang__)
#define ARMSVE_LABEL_PREFIX "L"
#define ARMSVE_LABEL_SUFFIX "%="
#else
#define ARMSVE_LABEL_PREFIX "."
#define ARMSVE_LABEL_SUFFIX ""
#endif

// #str is applied directly to the macro parameter, so the label name is
// stringified exactly as written at the use site.
#define LABEL(str)  " "     ARMSVE_LABEL_PREFIX #str ARMSVE_LABEL_SUFFIX ": \n\t"
#define BEQ(str)    "b.eq " ARMSVE_LABEL_PREFIX #str ARMSVE_LABEL_SUFFIX " \n\t"
#define BNE(str)    "b.ne " ARMSVE_LABEL_PREFIX #str ARMSVE_LABEL_SUFFIX " \n\t"
#define BRANCH(str) "b "    ARMSVE_LABEL_PREFIX #str ARMSVE_LABEL_SUFFIX " \n\t"

// CLEAR_COL2(Z0,Z1): expands to assembly text for two `dup` instructions that
// broadcast the immediate #0 into registers Z0 and Z1, i.e. zeroes both.
// DT is not defined in this section; because it is pasted between string
// literals it must expand to a string literal (presumably the element-size
// suffix such as "d" or "s" -- confirm where this header is included).
#define CLEAR_COL2(Z0,Z1) \
" dup "#Z0"."DT", #0 \n\t" \
" dup "#Z1"."DT", #0 \n\t"
Expand Down
Loading