From 01100456b17068398937a8f02d4523316675d905 Mon Sep 17 00:00:00 2001 From: Jun He Date: Wed, 18 May 2022 16:02:42 +0000 Subject: [PATCH 1/2] decomp: add generic fast huf4x* decoding This is the C version of the fast decompression algorithm implemented in huf_decompress_amd64.S. Signed-off-by: Jun He Change-Id: I964b109f4fd7fc9ca256b280e9add37c84f2e597 --- lib/decompress/huf_decompress.c | 315 +++++++++++++++++++++----- lib/decompress/huf_decompress_amd64.S | 28 +-- 2 files changed, 275 insertions(+), 68 deletions(-) diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index c6fd9286006..b6cd27720ee 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -57,18 +57,27 @@ #endif #define HUF_ASM_DECL HUF_EXTERN_C +#ifndef HUF_ENABLE_FAST_DEC +# if (defined(__x86_64__) || defined(__aarch64__)) +# define HUF_ENABLE_FAST_DEC 1 +# else +# define HUF_ENABLE_FAST_DEC 0 +# endif +#endif + #if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) # define HUF_NEED_BMI2_FUNCTION 1 #else # define HUF_NEED_BMI2_FUNCTION 0 #endif -#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +#if !((ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) || HUF_ENABLE_FAST_DEC) # define HUF_NEED_DEFAULT_FUNCTION 1 #else # define HUF_NEED_DEFAULT_FUNCTION 0 #endif + /* ************************************************************** * Error Management ****************************************************************/ @@ -139,7 +148,7 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if HUF_ENABLE_FAST_DEC static size_t HUF_initDStream(BYTE const* ip) { BYTE const lastByte = ip[7]; @@ -156,15 +165,15 @@ typedef struct { BYTE const* ilimit; BYTE* oend; BYTE const* iend[4]; -} HUF_DecompressAsmArgs; +} HUF_DecompressFastArgs; /** - * Initializes args for the asm decoding loop. + * Initializes args for the fast decoding loop. 
* @returns 0 on success * 1 if the fallback implementation should be used. * Or an error code on failure. */ -static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) { void const* dt = DTable + 1; U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; @@ -249,7 +258,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, return 0; } -static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) { /* Validate that we haven't overwritten. */ if (args->op[stream] > segmentEnd) @@ -272,7 +281,7 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs return 0; } -#endif +#endif /* HUF_ENABLE_FAST_DEC */ #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -648,29 +657,130 @@ HUF_decompress4X1_usingDTable_internal_body( } } +static #if HUF_NEED_BMI2_FUNCTION -static BMI2_TARGET_ATTRIBUTE -size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* DTable) { - return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); -} +BMI2_TARGET_ATTRIBUTE #endif - -#if HUF_NEED_DEFAULT_FUNCTION -static size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable) { return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if HUF_ENABLE_FAST_DEC + +#define FOR_EACH_STREAM(X) \ + X(0); \ + X(1); \ + X(2); \ + X(3) + +/* Calls X(N, idx) for each stream 
0, 1, 2, 3. */ +#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ + X(0, idx); \ + X(1, idx); \ + X(2, idx); \ + X(3, idx) -HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; +#if ZSTD_ENABLE_ASM_X86_64_BMI2 +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; +#else +#if !HUF_NEED_BMI2_FUNCTION +/* C version of the 4-stream decoding loop implemented in huf_decompress_amd64.S. Loads ip[], op[], bits[], dt, ilimit and oend from args, decodes 5 symbols per stream per inner iteration, and stores the updated ip[], op[] and bits[] back into args before returning. Each table entry is read as a U16: the high byte is the decoded symbol and the low 6 bits are the number of bits to consume. The shift count is masked because shifting a U64 by the whole entry (>= 64 whenever the symbol byte is non-zero) is undefined behaviour in C. __builtin_ctzll is used so the full 64-bit container is scanned even where unsigned long is 32 bits wide. */ +static void HUF_decompress4X1_usingDTable_internal_fast_loop(HUF_DecompressFastArgs* args) +{ + BYTE *ip0, *ip1, *ip2, *ip3; + BYTE *op0, *op1, *op2, *op3; + U64 bits0, bits1, bits2, bits3; + BYTE const *ilimit = args->ilimit; + BYTE *oend = args->oend; + BYTE *olimit = NULL; + + U64 oloop, iloop, loop; + U64 var0, var1, var2, var3; + U16 *dtable; + +#define GET_NEXT_DELT(n) \ + var##n = (U64)(dtable[(bits##n >> 53)]); + +#define DECODE_FROM_DELT(n, idx) \ + op##n[idx] = (BYTE)(var##n >> 8); \ + bits##n <<= (var##n & 0x3F) + +#define DECODE_AND_GET_NEXT(n, idx) \ + DECODE_FROM_DELT(n, idx); \ + GET_NEXT_DELT(n) + +#define RELOAD_BITS(n) \ + bits##n = __builtin_ctzll(bits##n); \ + var##n = bits##n & 7; \ + op##n += 5; \ + bits##n >>= 3; \ + ip##n -= bits##n; \ + memcpy(&bits##n, ip##n, sizeof(U64)); \ + bits##n |= 1; \ + bits##n <<= var##n; + +#define PREPARE_NEXT_ITER(n) \ + RELOAD_BITS(n); \ + GET_NEXT_DELT(n) + + /* read streams params */ + memcpy(&ip0, &args->ip[0], sizeof(BYTE *)); + memcpy(&ip1, &args->ip[1], sizeof(BYTE *)); + memcpy(&ip2, &args->ip[2], sizeof(BYTE *)); + memcpy(&ip3, &args->ip[3], sizeof(BYTE *)); + op0 = args->op[0]; op1 = args->op[1]; + op2 = args->op[2]; op3 = args->op[3]; + bits0 = args->bits[0]; bits1 = args->bits[1]; + bits2 = args->bits[2]; bits3 = args->bits[3]; + memcpy(&dtable, &args->dt, sizeof(U16 *)); + + /* compute boundary of loop: every iteration writes 5 bytes per stream and rewinds each input pointer by at most 7 bytes */ + oloop = (oend - op3) / 5; + iloop = (ip0 - ilimit) / 7; + loop = oloop > iloop? 
iloop : oloop; + + while (loop >= (4) && (ip1 >= ip0) + && (ip2 >= ip1) && (ip3 >= ip2)) { + olimit = op3 + loop * 5; + FOR_EACH_STREAM(GET_NEXT_DELT); + + do { + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0); + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1); + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2); + FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3); + FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4); + + /* Reload each stream & fetch the next table entry + * to prepare for the next iteration + */ + FOR_EACH_STREAM(PREPARE_NEXT_ITER); + } while (op3 < olimit); /* If op3 < olimit: continue the loop */ + /* Re-compute the iteration budget; olimit is refreshed from it at the top of the outer loop */ + oloop = (oend - op3) / 5; + iloop = (ip0 - ilimit) / 7; + loop = oloop > iloop? iloop : oloop; + } +#undef PREPARE_NEXT_ITER +#undef RELOAD_BITS +#undef DECODE_AND_GET_NEXT +#undef DECODE_FROM_DELT +#undef GET_NEXT_DELT + + args->ip[0] = ip0; args->ip[1] = ip1; + args->ip[2] = ip2; args->ip[3] = ip3; + args->op[0] = op0; args->op[1] = op1; + args->op[2] = op2; args->op[3] = op3; + args->bits[0] = bits0; args->bits[1] = bits1; + args->bits[2] = bits2; args->bits[3] = bits3; + return; +} +#endif /* !HUF_NEED_BMI2_FUNCTION */ +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ -static HUF_ASM_X86_64_BMI2_ATTRS -size_t -HUF_decompress4X1_usingDTable_internal_bmi2_asm( +static size_t +HUF_decompress4X1_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) @@ -678,16 +788,23 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - HUF_DecompressAsmArgs args; + HUF_DecompressFastArgs args; + { - size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); FORWARD_IF_ERROR(ret, "Failed to init asm args"); if (ret != 0) - return 
HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + HUF_decompress4X1_usingDTable_internal_asm_loop(&args); +#elif HUF_NEED_BMI2_FUNCTION + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); +#else + HUF_decompress4X1_usingDTable_internal_fast_loop(&args); +#endif /* Our loop guarantees that ip[] >= ilimit and that we haven't * overwritten any op[]. @@ -720,7 +837,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( /* decoded size */ return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +#endif /* HUF_ENABLE_FAST_DEC */ typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -734,18 +851,18 @@ static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, { #if DYNAMIC_BMI2 if (bmi2) { -# if ZSTD_ENABLE_ASM_X86_64_BMI2 - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# if HUF_ENABLE_FAST_DEC + return HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable); # else - return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); # endif } #else (void)bmi2; #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#if HUF_ENABLE_FAST_DEC + return HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable); #else return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); #endif @@ -1364,44 +1481,134 @@ HUF_decompress4X2_usingDTable_internal_body( } } +static #if 
HUF_NEED_BMI2_FUNCTION -static BMI2_TARGET_ATTRIBUTE -size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* DTable) { - return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); -} +BMI2_TARGET_ATTRIBUTE #endif - -#if HUF_NEED_DEFAULT_FUNCTION -static size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable) { return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); } -#endif +#if HUF_ENABLE_FAST_DEC #if ZSTD_ENABLE_ASM_X86_64_BMI2 +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; +#else +#if !HUF_NEED_BMI2_FUNCTION +static void HUF_decompress4X2_usingDTable_internal_fast_loop(HUF_DecompressFastArgs* args) +{ + BYTE *ip0, *ip1, *ip2, *ip3; + BYTE *op0, *op1, *op2, *op3; + BYTE *oend0, *oend1, *oend2, *oend3; + U64 bits0, bits1, bits2, bits3; + BYTE const *ilimit = args->ilimit; + BYTE *olimit = NULL; + + U16 *dtable; + U64 var0, var1, var2, var3; + U64 oloop, iloop, loop; + + /* load stream params; each stream's output segment ends where the next stream's output starts, and the last one ends at args->oend */ + memcpy(&ip0, &args->ip[0], sizeof(BYTE *)); + memcpy(&ip1, &args->ip[1], sizeof(BYTE *)); + memcpy(&ip2, &args->ip[2], sizeof(BYTE *)); + memcpy(&ip3, &args->ip[3], sizeof(BYTE *)); + op0 = args->op[0]; op1 = args->op[1]; + op2 = args->op[2]; op3 = args->op[3]; + bits0 = args->bits[0]; bits1 = args->bits[1]; + bits2 = args->bits[2]; bits3 = args->bits[3]; + memcpy(&dtable, &(args->dt), sizeof(U16 *)); + oend3 = args->oend; oend2 = op3; + oend1 = op2; oend0 = op1; + + /* compute boundary: the iteration count is limited by the input (7 bytes per iteration) and by the smallest remaining output segment (10 bytes per iteration) */ + iloop = (ip0 - ilimit) / 7; + var0 = oend3 - op3; var1 = oend2 - op2; + var2 = oend1 - op1; var3 = oend0 - op0; + var0 = MIN(var0, var1); var2 = MIN(var2, var3); + oloop = MIN(var0, var2); + oloop = oloop / 10; + loop = MIN(oloop, iloop); + + while(loop >= (2) && (ip1 >= ip0) + && (ip2 >= ip1) 
&& (ip3 >= ip2)) { + olimit = op3 + loop * 5; + +#define DECODE(n) \ + var##n = (bits##n >> 53) *2; \ + memcpy(mem##n, dtable+var##n, 2*sizeof(U16)); \ + memcpy(op##n, mem##n, sizeof(U16)); \ + bits##n <<= mem##n[2]; \ + op##n += mem##n[3] + +#define RELOAD_BITS(n) \ + bits##n = __builtin_ctzll(bits##n); \ + var##n = bits##n & 7; \ + bits##n >>= 3; \ + ip##n -= bits##n; \ + memcpy(&bits##n, ip##n, sizeof(U64)); \ + bits##n |= 1; \ + bits##n <<= var##n + + do { + BYTE mem0[4],mem1[4],mem2[4],mem3[4]; + /* Decode 5 table entries from each of the 4 streams (20 entries total): each entry copies 2 bytes to the output, consumes mem[2] bits and advances op by mem[3] bytes. */ + FOR_EACH_STREAM(DECODE); + FOR_EACH_STREAM(DECODE); + FOR_EACH_STREAM(DECODE); + FOR_EACH_STREAM(DECODE); + FOR_EACH_STREAM(DECODE); + + FOR_EACH_STREAM(RELOAD_BITS); + } while (op3 < olimit); /* If op3 < olimit: continue the loop */ + iloop = (ip0 - ilimit) / 7; + var0 = oend3 - op3; var1 = oend2 - op2; + var2 = oend1 - op1; var3 = oend0 - op0; + var0 = MIN(var0, var1); var2 = MIN(var2, var3); + oloop = MIN(var0, var2); + oloop = oloop / 10; + loop = MIN(oloop, iloop); + } +#undef DECODE +#undef RELOAD_BITS + + args->ip[0] = ip0; args->ip[1] = ip1; + args->ip[2] = ip2; args->ip[3] = ip3; + args->op[0] = op0; args->op[1] = op1; + args->op[2] = op2; args->op[3] = op3; + args->bits[0] = bits0; args->bits[1] = bits1; + args->bits[2] = bits2; args->bits[3] = bits3; + return; +} +#undef FOR_EACH_STREAM +#undef FOR_EACH_STREAM_WITH_INDEX +#endif /* !HUF_NEED_BMI2_FUNCTION */ +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ -HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; - -static HUF_ASM_X86_64_BMI2_ATTRS size_t -HUF_decompress4X2_usingDTable_internal_bmi2_asm( +static size_t +HUF_decompress4X2_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable) { void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - 
HUF_DecompressAsmArgs args; + HUF_DecompressFastArgs args; { - size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); FORWARD_IF_ERROR(ret, "Failed to init asm args"); if (ret != 0) - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + HUF_decompress4X2_usingDTable_internal_asm_loop(&args); + #elif HUF_NEED_BMI2_FUNCTION + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); + #else + HUF_decompress4X2_usingDTable_internal_fast_loop(&args); + #endif /* note : op4 already verified within main loop */ assert(args.ip[0] >= iend); @@ -1432,25 +1639,25 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( /* decoded size */ return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +#endif /* HUF_ENABLE_FAST_DEC */ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable, int bmi2) { #if DYNAMIC_BMI2 if (bmi2) { -# if ZSTD_ENABLE_ASM_X86_64_BMI2 - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +# if HUF_ENABLE_FAST_DEC + return HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable); # else - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); # endif } #else (void)bmi2; #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +#if HUF_ENABLE_FAST_DEC + 
return HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable); #else return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); #endif diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S index 3f0e5c26c7d..86cf296d797 100644 --- a/lib/decompress/huf_decompress_amd64.S +++ b/lib/decompress/huf_decompress_amd64.S @@ -21,7 +21,7 @@ /* Calling convention: * - * %rdi contains the first argument: HUF_DecompressAsmArgs*. + * %rdi contains the first argument: HUF_DecompressFastArgs*. * %rbp isn't maintained (no frame pointer). * %rsp contains the stack pointer that grows down. * No red-zone is assumed, only addresses >= %rsp are used. @@ -30,14 +30,14 @@ * TODO: Support Windows calling convention. */ -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_asm_loop +.global HUF_decompress4X2_usingDTable_internal_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_asm_loop .text /* Sets up register mappings for clarity. 
@@ -95,8 +95,8 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) /* Define both _HUF_* & HUF_* symbols because MacOS * C symbols are prefixed with '_' & Linux symbols aren't. */ -_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +_HUF_decompress4X1_usingDTable_internal_asm_loop: +HUF_decompress4X1_usingDTable_internal_asm_loop: /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx @@ -114,7 +114,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: push %r14 push %r15 - /* Read HUF_DecompressAsmArgs* args from %rax */ + /* Read HUF_DecompressFastArgs* args from %rax */ movq %rdi, %rax movq 0(%rax), %ip0 movq 8(%rax), %ip1 @@ -350,8 +350,8 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: pop %rax ret -_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +_HUF_decompress4X2_usingDTable_internal_asm_loop: +HUF_decompress4X2_usingDTable_internal_asm_loop: /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx From 9a7d7dad5516aa9d73e8c08ca7d61733c0cdc8b7 Mon Sep 17 00:00:00 2001 From: Jun He Date: Mon, 6 Jun 2022 20:39:18 +0800 Subject: [PATCH 2/2] decomp: add aarch64 assembly of HUF_decompress4X1 This is based on the fast HUF_4x1 decoding firstly introduced by Nick Terrell. It is manually tuned to balance performance across various Arm micro-architectures including N1/A72/A57. 
Signed-off-by: Jun He Change-Id: I2de7afd44a4b43cfbedc80747aef4a36c6ae35eb --- lib/common/portability_macros.h | 8 + lib/decompress/huf_decompress.c | 4 +- lib/decompress/huf_decompress_aarch64.S | 309 ++++++++++++++++++++++++ lib/libzstd.mk | 4 +- 4 files changed, 321 insertions(+), 4 deletions(-) create mode 100644 lib/decompress/huf_decompress_aarch64.S diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index 1650fa3d8cf..1d56f8b0c34 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -114,6 +114,14 @@ # define ZSTD_ASM_SUPPORTED 0 #endif +#if !defined(ZSTD_DISABLE_ASM) && \ + ZSTD_ASM_SUPPORTED && \ + defined(__aarch64__) +# define ZSTD_ENABLE_ASM_AARCH64 1 +#else +# define ZSTD_ENABLE_ASM_AARCH64 0 +#endif + /** * Determines whether we should enable assembly for x86-64 * with BMI2. diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index b6cd27720ee..022d10738cb 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -681,7 +681,7 @@ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, X(2, idx); \ X(3, idx) -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 || ZSTD_ENABLE_ASM_AARCH64 HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; #else #if !HUF_NEED_BMI2_FUNCTION @@ -798,7 +798,7 @@ HUF_decompress4X1_usingDTable_internal_fast( } assert(args.ip[0] >= args.ilimit); -#if ZSTD_ENABLE_ASM_X86_64_BMI2 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 || ZSTD_ENABLE_ASM_AARCH64 HUF_decompress4X1_usingDTable_internal_asm_loop(&args); #elif HUF_NEED_BMI2_FUNCTION return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); diff --git a/lib/decompress/huf_decompress_aarch64.S b/lib/decompress/huf_decompress_aarch64.S new file mode 100644 index 00000000000..87237b9e1c0 --- /dev/null +++ b/lib/decompress/huf_decompress_aarch64.S @@ 
-0,0 +1,309 @@ +/********************************************************************** + Copyright(c) 2022 Arm Corporation All rights reserved. + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ + +#include "../common/portability_macros.h" + +#if defined(__ELF__) && defined(__GNUC__) +.section .note.GNU-stack,"",%progbits +#endif + +#if ZSTD_ENABLE_ASM_AARCH64 +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_asm_loop) + +.global HUF_decompress4X1_usingDTable_internal_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_asm_loop + +.arch armv8-a + +.text + +.p2align 3,,7 +/* + * ArmV8-A calling conventions: + * (https://developer.arm.com/documentation/den0024/a/The-ABI-for-ARM-64-bit-Architecture/Register-use-in-the-AArch64-Procedure-Call-Standard/Parameters-in-general-purpose-registers) + * x30 (LR): Procedure link register, used to return from subroutines. + * x29 (FP): Frame pointer. + * x19 to x29: Callee-saved. + * x18 (PR): Platform register. Used for some operating-system-specific special purpose, or an additional caller-saved register. + * x16 (IP0) and x17 (IP1): Intra-Procedure-call scratch registers. + * x9 to x15: Local variables, caller saved. + * x8 (XR): Indirect return value address. + * x0 to x7: Argument values passed to and results returned from a subroutine. 
+ */ + +#define coef7 x30 /* LR is saved by the prologue and restored before ret, so it is free as a scratch register here; x18 is the platform register and must not be used (reserved on Apple and Windows targets) */ +#define coef5 x17 + +#define oend x16 +#define ilimit x15 +#define olimit x14 + +#define ip0 x13 +#define ip1 x12 +#define ip2 x11 +#define ip3 x10 + +#define op0 x9 +#define op1 x8 +#define op2 x7 +#define op3 x6 + +#define bits0 x5 +#define bits1 x4 +#define bits2 x3 +#define bits3 x2 + +#define dtable x1 + +/* tmp registers: iloop reuses x29 after the prologue has saved it; v0 - v5 live in callee-saved x19 - x24, which are spilled into the 64-byte frame. NOTE(review): while x29 holds iloop it no longer addresses the frame record, confirm this is acceptable for unwinders. */ +#define iloop x29 +#define v0x x20 +#define v0w w20 +#define v1x x21 +#define v1w w21 +#define v2x x22 +#define v2w w22 +#define v3x x23 +#define v3w w23 +#define v4x x19 +#define v4w w19 +#define v5x x24 +#define v5w w24 + +/* c prototype: https://www.godbolt.org/z/nTY38f9hY */ +_HUF_decompress4X1_usingDTable_internal_asm_loop: +HUF_decompress4X1_usingDTable_internal_asm_loop: + stp x29, x30, [sp, -64]! + mov x29, sp + mov coef7, 37450 /* ceil(2^18 / 7): mul followed by asr 18 approximates a division by 7. NOTE(review): the rounded-up reciprocal overestimates by one for some operands >= 43693, confirm ip0 - ilimit stays below that */ + mov coef5, 52429 /* ceil(2^18 / 5): same scheme for the division by 5 */ + ldp ip0, ip1, [x0] + ldp op2, op3, [x0, 48] + ldp ilimit, oend, [x0, 104] + ldp ip2, ip3, [x0, 16] + + sub iloop, ip0, ilimit + sub olimit, oend, op3 + stp v4x, v5x, [sp, 16] + mul iloop, iloop, coef7 + mul olimit, olimit, coef5 + ldp op0, op1, [x0, 32] + asr iloop, iloop, 18 + asr olimit, olimit, 18 + + cmp olimit, iloop + csel olimit, olimit, iloop, ls + ldp bits0, bits1, [x0, 64] + cmp ip0, ip1 + ccmp ip1, ip2, 2, ls + cset v5w, ls + cmp olimit, 3 + ccmp ip2, ip3, 2, hi + cset v4w, ls + ldp bits2, bits3, [x0, 80] + tst v5w, v4w + ldr dtable, [x0, 96] + beq .exit_4x1 + + stp v2x, v3x, [sp, 48] + lsr v2x, bits2, 53 + lsr v3x, bits3, 53 + stp v0x, v1x, [sp, 32] + lsr v0x, bits0, 53 + lsr v1x, bits1, 53 + ldrh v2w, [dtable, v2x, lsl 1] + ldrh v3w, [dtable, v3x, lsl 1] + ldrh v0w, [dtable, v0x, lsl 1] + ldrh v1w, [dtable, v1x, lsl 1] + +.update_4x1_olimit: + add olimit, olimit, olimit, lsl 2 + add olimit, op3, olimit + +.decode_4x1_loop: +#if defined(__APPLE__) && (__APPLE__) +/* on Apple platforms '%%' is used as separator instead of ';' + * on other Linux systems. 
+ */ +#define DECODE_STREAM(idx) \ + lsl bits0, bits0, v0x%% \ + ubfx v0x, v0x, 8, 8%% \ + lsr v4x, bits0, 53%% \ + lsl bits1, bits1, v1x%% \ + strb v0w, [op0, -##idx]%% \ + lsr v5x, bits1, 53%% \ + ldrh v0w, [dtable, v4x, lsl 1]%% \ + ubfx v1x, v1x, 8, 8%% \ + lsl bits2, bits2, v2x%% \ + strb v1w, [op1, -##idx]%% \ + lsr v4x, bits2, 53%% \ + ldrh v1w, [dtable, v5x, lsl 1]%% \ + ubfx v2x, v2x, 8, 8%% \ + lsl bits3, bits3, v3x%% \ + strb v2w, [op2, -##idx]%% \ + ubfx v3x, v3x, 8, 8%% \ + lsr v5x, bits3, 53%% \ + ldrh v2w, [dtable, v4x, lsl 1]%% \ + strb v3w, [op3, -##idx]%% \ + ldrh v3w, [dtable, v5x, lsl 1] + +#define PROCESS_EACH_STREAM(FN) \ + FN(5)%% \ + FN(4)%% \ + FN(3)%% \ + FN(2) +#else +#define DECODE_STREAM(idx) \ + lsl bits0, bits0, v0x; \ + ubfx v0x, v0x, 8, 8; \ + lsr v4x, bits0, 53; \ + lsl bits1, bits1, v1x; \ + strb v0w, [op0, -##idx]; \ + lsr v5x, bits1, 53; \ + ldrh v0w, [dtable, v4x, lsl 1]; \ + ubfx v1x, v1x, 8, 8; \ + lsl bits2, bits2, v2x; \ + strb v1w, [op1, -##idx]; \ + lsr v4x, bits2, 53; \ + ldrh v1w, [dtable, v5x, lsl 1]; \ + ubfx v2x, v2x, 8, 8; \ + lsl bits3, bits3, v3x; \ + strb v2w, [op2, -##idx]; \ + ubfx v3x, v3x, 8, 8; \ + lsr v5x, bits3, 53; \ + ldrh v2w, [dtable, v4x, lsl 1]; \ + strb v3w, [op3, -##idx]; \ + ldrh v3w, [dtable, v5x, lsl 1] + +#define PROCESS_EACH_STREAM(FN) \ + FN(5); \ + FN(4); \ + FN(3); \ + FN(2) +#endif + + add op3, op3, 5 + add op0, op0, 5 + add op1, op1, 5 + add op2, op2, 5 + + PROCESS_EACH_STREAM(DECODE_STREAM) + + // DECODE_FROM_DELT and PREPARE_NEXT_ITER + lsl bits0, bits0, v0x + ubfx v4x, v0x, 8, 8 + rbit bits0, bits0 + ubfx v5x, v1x, 8, 8 + lsl bits1, bits1, v1x + strb v4w, [op0, -1] + + clz bits0, bits0 + rbit bits1, bits1 + ubfx v4x, v2x, 8, 8 + lsl bits2, bits2, v2x + sub ip0, ip0, bits0, lsr 3 + clz bits1, bits1 + rbit bits2, bits2 + strb v5w, [op1, -1] + + lsl bits3, bits3, v3x + ubfx v5x, v3x, 8, 8 + sub ip1, ip1, bits1, lsr 3 + rbit bits3, bits3 + clz bits2, bits2 + strb v4w, [op2, -1] + + clz 
bits3, bits3 + sub ip2, ip2, bits2, lsr 3 + strb v5w, [op3, -1] + sub ip3, ip3, bits3, lsr 3 + ubfx v0x, bits0, 0, 3 + ldr bits0, [ip0] + ubfx v1x, bits1, 0, 3 + ubfx v2x, bits2, 0, 3 + ldr bits1, [ip1] + + ubfx v3x, bits3, 0, 3 + ldr bits2, [ip2] + orr bits0, bits0, 1 + ldr bits3, [ip3] + lsl bits0, bits0, v0x + orr bits1, bits1, 1 + lsr v4x, bits0, 53 + lsl bits1, bits1, v1x + ldrh v0w, [dtable, v4x, lsl 1] + + orr bits2, bits2, 1 + lsr v5x, bits1, 53 + lsl bits2, bits2, v2x + ldrh v1w, [dtable, v5x, lsl 1] + orr bits3, bits3, 1 + lsr v4x, bits2, 53 + lsl bits3, bits3, v3x + ldrh v2w, [dtable, v4x, lsl 1] + lsr v5x, bits3, 53 + + cmp olimit, op3 + ldrh v3w, [dtable, v5x, lsl 1] + bhi .decode_4x1_loop + + sub olimit, oend, op3 + sub iloop, ip0, ilimit + mul olimit, olimit, coef5 + mul iloop, iloop, coef7 + asr olimit, olimit, 18 + asr iloop, iloop, 18 + + cmp olimit, iloop + csel olimit, olimit, iloop, ls + cmp ip0, ip1 + ccmp ip1, ip2, 2, ls + cset v5w, ls + cmp olimit, 3 + ccmp ip2, ip3, 2, hi + cset v4w, ls + tst v5w, v4w + bne .update_4x1_olimit + + ldp v2x, v3x, [sp, 48] + ldp v0x, v1x, [sp, 32] + +.exit_4x1: + stp ip0, ip1, [x0] + stp ip2, ip3, [x0, 16] + stp op0, op1, [x0, 32] + stp op2, op3, [x0, 48] + stp bits0, bits1, [x0, 64] + stp bits2, bits3, [x0, 80] + ldp v4x, v5x, [sp, 16] + ldp x29, x30, [sp], 64 + ret + +#undef DECODE_STREAM +#undef PROCESS_EACH_STREAM +#endif diff --git a/lib/libzstd.mk b/lib/libzstd.mk index df298d78978..6577939fdc5 100644 --- a/lib/libzstd.mk +++ b/lib/libzstd.mk @@ -133,14 +133,14 @@ ZSTD_DICTBUILDER_FILES := $(sort $(wildcard $(LIBZSTD)/dictBuilder/*.c)) ZSTD_DEPRECATED_FILES := $(sort $(wildcard $(LIBZSTD)/deprecated/*.c)) ZSTD_LEGACY_FILES := -ZSTD_DECOMPRESS_AMD64_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*_amd64.S)) +ZSTD_DECOMPRESS_ASM_FILES := $(sort $(wildcard $(LIBZSTD)/decompress/*.S)) ifneq ($(ZSTD_NO_ASM), 0) CPPFLAGS += -DZSTD_DISABLE_ASM else # Unconditionally add the ASM files they are disabled by # 
macros in the .S file. - ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_AMD64_ASM_FILES) + ZSTD_DECOMPRESS_FILES += $(ZSTD_DECOMPRESS_ASM_FILES) endif ifneq ($(HUF_FORCE_DECOMPRESS_X1), 0)