From 57732f7a3d40767d0ba921923abf33fc7232b7d8 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Fri, 13 Jan 2023 16:34:52 -0800 Subject: [PATCH 1/2] [huf] Add generic C versions of the fast decoding loops Add generic C versions of the fast decoding loops to serve architectures that don't have an assembly implementation. Also allow selecting the C decoding loop over the assembly decoding loop through a zstd decompression parameter `ZSTD_d_disableHuffmanAssembly`. I benchmarked on my Intel i9-9900K and my Macbook Air with an M1 processor. The benchmark command forces zstd to compress without any matches, using only literals compression, and measures only Huffman decompression speed: ``` zstd -b1e1 --compress-literals --zstd=tlen=131072 silesia.tar ``` The new fast decoding loops outperform the previous implementation uniformly, but don't beat the x86-64 assembly. Additionally, the fast C decoding loops suffer from the same stability problems that we've seen in the past, where the assembly version doesn't. So even though clang gets close to assembly on x86-64, it still has stability issues. | Arch | Function | Compiler | Default (MB/s) | Assembly (MB/s) | Fast (MB/s) | |---------|----------------|--------------|----------------|-----------------|-------------| | x86-64 | decompress 4X1 | gcc-12.2.0 | 1029.6 | 1308.1 | 1208.1 | | x86-64 | decompress 4X1 | clang-14.0.6 | 1019.3 | 1305.6 | 1276.3 | | x86-64 | decompress 4X2 | gcc-12.2.0 | 1348.5 | 1657.0 | 1374.1 | | x86-64 | decompress 4X2 | clang-14.0.6 | 1027.6 | 1659.9 | 1468.1 | | aarch64 | decompress 4X1 | clang-12.0.5 | 1081.0 | N/A | 1234.9 | | aarch64 | decompress 4X2 | clang-12.0.5 | 1270.0 | N/A | 1516.6 | --- lib/common/entropy_common.c | 8 +- lib/common/huf.h | 7 +- lib/common/portability_macros.h | 8 + lib/decompress/huf_decompress.c | 425 ++++++++++++++++++---- lib/decompress/huf_decompress_amd64.S | 26 +- lib/decompress/zstd_decompress.c | 13 + lib/decompress/zstd_decompress_block.c | 4 +- lib/decompress/zstd_decompress_internal.h | 1 + lib/zstd.h | 15 +- tests/fuzz/huf_decompress.c | 3 +- tests/fuzz/huf_round_trip.c | 3 +- tests/fuzzer.c | 11 + tests/zstreamtest.c | 15 +- 13 files changed, 448 insertions(+), 91 deletions(-) diff --git a/lib/common/entropy_common.c b/lib/common/entropy_common.c index e4092196618..e2173afb0a8 100644 --- a/lib/common/entropy_common.c +++ b/lib/common/entropy_common.c @@ -236,7 +236,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, const void* src, size_t srcSize) { U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; - return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); + return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); } FORCE_INLINE_TEMPLATE size_t @@ -328,13 +328,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, - int bmi2) + int flags) { #if DYNAMIC_BMI2 - if (bmi2) { + if (flags & HUF_flags_bmi2) { return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } #endif - (void)bmi2; + (void)flags; return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } diff --git a/lib/common/huf.h b/lib/common/huf.h index 5ab79db7af3..73d1ee56543 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -107,7 +107,12 @@ typedef enum { * If set: Don't use assembly implementations * If unset: Allow using assembly implementations */ - HUF_flags_disableAsm = (1 << 4) + HUF_flags_disableAsm = (1 << 4), + /** + * If set: Don't use the fast decoding loop, always use the fallback decoding loop. + * If unset: Use the fast decoding loop when possible. + */ + HUF_flags_disableFast = (1 << 5) } HUF_flags_e; diff --git a/lib/common/portability_macros.h b/lib/common/portability_macros.h index bf3e2b7b1c6..8fd6ea82d19 100644 --- a/lib/common/portability_macros.h +++ b/lib/common/portability_macros.h @@ -137,12 +137,20 @@ /* * For x86 ELF targets, add .note.gnu.property section for Intel CET in * assembly sources when CET is enabled. + * + * Additionally, any function that may be called indirectly must begin + * with ZSTD_CET_ENDBRANCH. */ #if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ && defined(__has_include) # if __has_include() # include +# define ZSTD_CET_ENDBRANCH _CET_ENDBR # endif #endif +#ifndef ZSTD_CET_ENDBRANCH +# define ZSTD_CET_ENDBRANCH +#endif + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 3cafeba5340..0a29f03d422 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -43,10 +43,14 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif -#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 -# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is + * supported at runtime, so we can add the BMI2 target attribute. + * When it is disabled, we will still get BMI2 if it is enabled statically. + */ +#if DYNAMIC_BMI2 +# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE #else -# define HUF_ASM_X86_64_BMI2_ATTRS +# define HUF_FAST_BMI2_ATTRS #endif #ifdef __cplusplus @@ -56,7 +60,7 @@ #endif #define HUF_ASM_DECL HUF_EXTERN_C -#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +#if DYNAMIC_BMI2 # define HUF_NEED_BMI2_FUNCTION 1 #else # define HUF_NEED_BMI2_FUNCTION 0 @@ -78,6 +82,11 @@ /* ************************************************************** * BMI2 Variant Wrappers ****************************************************************/ +typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + #if DYNAMIC_BMI2 #define HUF_DGEN(fn) \ @@ -132,15 +141,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) return dtd; } -#if ZSTD_ENABLE_ASM_X86_64_BMI2 - -static size_t HUF_initDStream(BYTE const* ip) { +static size_t HUF_initFastDStream(BYTE const* ip) { BYTE const lastByte = ip[7]; size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; size_t const value = MEM_readLEST(ip) | 1; assert(bitsConsumed <= 8); + assert(sizeof(size_t) == 8); return value << bitsConsumed; } + + +/** + * The input/output arguments to the Huffman fast decoding loop: + * + * ip [in/out] - The input pointers, must be updated to reflect what is consumed. + * op [in/out] - The output pointers, must be updated to reflect what is written. + * bits [in/out] - The bitstream containers, must be updated to reflect the current state. + * dt [in] - The decoding table. + * ilimit [in] - The input limit, stop when any input pointer is below ilimit. + * oend [in] - The end of the output stream. op[3] must not cross oend. + * iend [in] - The end of each input stream. ip[i] may cross iend[i], + * as long as it is above ilimit, but that indicates corruption. + */ typedef struct { BYTE const* ip[4]; BYTE* op[4]; @@ -149,15 +171,17 @@ typedef struct { BYTE const* ilimit; BYTE* oend; BYTE const* iend[4]; -} HUF_DecompressAsmArgs; +} HUF_DecompressFastArgs; + +typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); /** - * Initializes args for the asm decoding loop. - * @returns 0 on success - * 1 if the fallback implementation should be used. + * Initializes args for the fast decoding loop. + * @returns 1 on success + * 0 if the fallback implementation should be used. * Or an error code on failure. */ -static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) +static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) { void const* dt = DTable + 1; U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; @@ -166,9 +190,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, BYTE* const oend = (BYTE*)dst + dstSize; - /* The following condition is false on x32 platform, - * but HUF_asm is not compatible with this ABI */ - if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + /* The fast decoding loop assumes 64-bit little-endian. + * This condition is false on x32. + */ + if (!MEM_isLittleEndian() || MEM_32bits()) + return 0; /* strict minimum : jump table + 1 byte per stream */ if (srcSize < 10) @@ -179,7 +205,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. */ if (dtLog != HUF_DECODER_FAST_TABLELOG) - return 1; + return 0; /* Read the jump table. */ { @@ -193,13 +219,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, args->iend[2] = args->iend[1] + length2; args->iend[3] = args->iend[2] + length3; - /* HUF_initDStream() requires this, and this small of an input + /* HUF_initFastDStream() requires this, and this small of an input * won't benefit from the ASM loop anyways. * length1 must be >= 16 so that ip[0] >= ilimit before the loop * starts. */ if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) - return 1; + return 0; if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ } /* ip[] contains the position that is currently loaded into bits[]. */ @@ -216,7 +242,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, /* No point to call the ASM loop for tiny outputs. */ if (args->op[3] >= oend) - return 1; + return 0; /* bits[] is the bit container. * It is read from the MSB down to the LSB. @@ -225,10 +251,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, * set, so that CountTrailingZeros(bits[]) can be used * to count how many bits we've consumed. */ - args->bits[0] = HUF_initDStream(args->ip[0]); - args->bits[1] = HUF_initDStream(args->ip[1]); - args->bits[2] = HUF_initDStream(args->ip[2]); - args->bits[3] = HUF_initDStream(args->ip[3]); + args->bits[0] = HUF_initFastDStream(args->ip[0]); + args->bits[1] = HUF_initFastDStream(args->ip[1]); + args->bits[2] = HUF_initFastDStream(args->ip[2]); + args->bits[3] = HUF_initFastDStream(args->ip[3]); /* If ip[] >= ilimit, it is guaranteed to be safe to * reload bits[]. It may be beyond its section, but is @@ -239,10 +265,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, args->oend = oend; args->dt = dt; - return 0; + return 1; } -static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) { /* Validate that we haven't overwritten. */ if (args->op[stream] > segmentEnd) @@ -257,7 +283,7 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs /* Construct the BIT_DStream_t. */ assert(sizeof(size_t) == 8); - bit->bitContainer = MEM_readLE64(args->ip[stream]); + bit->bitContainer = MEM_readLEST(args->ip[stream]); bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); bit->start = (const char*)args->iend[0]; bit->limitPtr = bit->start + sizeof(size_t); @@ -265,7 +291,6 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs return 0; } -#endif #ifndef HUF_FORCE_DECOMPRESS_X2 @@ -655,27 +680,132 @@ size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, #if ZSTD_ENABLE_ASM_X86_64_BMI2 -HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + U16 const* const dtable = (U16 const*)args->dt; + BYTE* const oend = args->oend; + BYTE const* const ilimit = args->ilimit; + + /* Copy the arguments to local variables */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + int symbol; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); + assert(ip[stream] >= ilimit); + } +#endif + /* Compute olimit */ + { + /* Each iteration produces 5 output symbols per stream */ + size_t const oiters = (size_t)(oend - op[3]) / 5; + /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes + * per stream. + */ + size_t const iiters = (size_t)(ip[0] - ilimit) / 7; + /* We can safely run iters iterations before running bounds checks */ + size_t const iters = MIN(oiters, iiters); + size_t const symbols = iters * 5; + + /* We can simply check that op[3] < olimit, instead of checking all + * of our bounds, since we can't hit the other bounds until we've run + * iters iterations, which only happens when op[3] == olimit. + */ + olimit = op[3] + symbols; + + /* Exit fast decoding loop once we get close to the end. */ + if (op[3] + 20 > olimit) + break; + + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + break; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + + do { + /* Decode 5 symbols in each of the 4 streams */ + for (symbol = 0; symbol < 5; ++symbol) { + for (stream = 0; stream < 4; ++stream) { + int const index = (int)(bits[stream] >> 53); + int const entry = (int)dtable[index]; + bits[stream] <<= (entry & 63); + op[stream][symbol] = (BYTE)((entry >> 8) & 0xFF); + } + } + /* Reload the bitstreams */ + for (stream = 0; stream < 4; ++stream) { + int const ctz = ZSTD_countTrailingZeros64(bits[stream]); + int const nbBits = ctz & 7; + int const nbBytes = ctz >> 3; + op[stream] += 5; + ip[stream] -= nbBytes; + bits[stream] = MEM_read64(ip[stream]) | 1; + bits[stream] <<= nbBits; + } + } while (op[3] < olimit); + } + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} -static HUF_ASM_X86_64_BMI2_ATTRS +/** + * @returns @p dstSize on success (>= 6) + * 0 if the fallback implementation should be used + * An error if an error occurred + */ +static HUF_FAST_BMI2_ATTRS size_t -HUF_decompress4X1_usingDTable_internal_bmi2_asm( +HUF_decompress4X1_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - HUF_DecompressAsmArgs args; - { size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); - FORWARD_IF_ERROR(ret, "Failed to init asm args"); - if (ret != 0) - return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + HUF_DecompressFastArgs args; + { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); + if (ret == 0) + return 0; } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + loopFn(&args); /* Our loop guarantees that ip[] >= ilimit and that we haven't * overwritten any op[]. @@ -705,37 +835,43 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( } /* decoded size */ + assert(dstSize != 0); return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ - -typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, - const void *cSrc, - size_t cSrcSize, - const HUF_DTable *DTable); HUF_DGEN(HUF_decompress1X1_usingDTable_internal) static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable, int flags) { + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; + #if DYNAMIC_BMI2 if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; # if ZSTD_ENABLE_ASM_X86_64_BMI2 - if (!(flags & HUF_flags_disableAsm)) - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } # endif - return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -#else - (void)flags; #endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - if (!(flags & HUF_flags_disableAsm)) - return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; + } #endif - return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); + + if (!(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, @@ -1322,26 +1458,167 @@ size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, #if ZSTD_ENABLE_ASM_X86_64_BMI2 -HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; + +#endif + +static HUF_FAST_BMI2_ATTRS +void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) +{ + U64 bits[4]; + BYTE const* ip[4]; + BYTE* op[4]; + BYTE* oend[4]; + HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; + BYTE const* const ilimit = args->ilimit; + + /* Copy the arguments to local registers. */ + ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); + ZSTD_memcpy(&ip, &args->ip, sizeof(ip)); + ZSTD_memcpy(&op, &args->op, sizeof(op)); + + oend[0] = op[1]; + oend[1] = op[2]; + oend[2] = op[3]; + oend[3] = args->oend; + + assert(MEM_isLittleEndian()); + assert(!MEM_32bits()); + + for (;;) { + BYTE* olimit; + int stream; + int symbol; + + /* Assert loop preconditions */ +#ifndef NDEBUG + for (stream = 0; stream < 4; ++stream) { + assert(op[stream] <= oend[stream]); + assert(ip[stream] >= ilimit); + } +#endif + /* Compute olimit */ + { + /* Each loop does 5 table lookups for each of the 4 streams. + * Each table lookup consumes up to 11 bits of input, and produces + * up to 2 bytes of output. + */ + /* We can consume up to 7 bytes of input per iteration per stream. + * We also know that each input pointer is >= ip[0]. So we can run + * iters loops before running out of input. + */ + size_t iters = (size_t)(ip[0] - ilimit) / 7; + /* Each iteration can produce up to 10 bytes of output per stream. + * Each output stream my advance at different rates. So take the + * minimum number of safe iterations among all the output streams. + */ + for (stream = 0; stream < 4; ++stream) { + size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; + iters = MIN(iters, oiters); + } + + /* Each iteration produces at least 5 output symbols. So until + * op[3] crosses olimit, we know we haven't executed iters + * iterations yet. This saves us maintaining an iters counter, + * at the expense of computing the remaining # of iterations + * more frequently. + */ + olimit = op[3] + (iters * 5); + + /* Exit the fast decoding loop if we are too close to the end. */ + if (op[3] + 10 > olimit) + break; -static HUF_ASM_X86_64_BMI2_ATTRS size_t -HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* Exit the decoding loop if any input pointer has crossed the + * previous one. This indicates corruption, and a precondition + * to our loop is that ip[i] >= ip[0]. + */ + for (stream = 1; stream < 4; ++stream) { + if (ip[stream] < ip[stream - 1]) + break; + } + } + +#ifndef NDEBUG + for (stream = 1; stream < 4; ++stream) { + assert(ip[stream] >= ip[stream - 1]); + } +#endif + + do { + /* Do 5 table lookups for each of the first 3 streams */ + for (symbol = 0; symbol < 5; ++symbol) { + for (stream = 0; stream < 3; ++stream) { + int const index = (int)(bits[stream] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[stream], entry.sequence); + bits[stream] <<= (entry.nbBits); + op[stream] += (entry.length); + } + } + /* Do 1 table lookup from the final stream */ + { + int const index = (int)(bits[3] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[3], entry.sequence); + bits[3] <<= (entry.nbBits); + op[3] += (entry.length); + } + /* Do 4 table lookups from the final stream & reload bitstreams */ + for (stream = 0; stream < 4; ++stream) { + /* Do a table lookup from the final stream. + * This is interleaved with the reloading to reduce register + * pressure. This shouldn't be necessary, but compilers can + * struggle with codegen with high register pressure. + */ + { + int const index = (int)(bits[3] >> 53); + HUF_DEltX2 const entry = dtable[index]; + MEM_write16(op[3], entry.sequence); + bits[3] <<= (entry.nbBits); + op[3] += (entry.length); + } + /* Reload the bistreams. The final bitstream must be reloaded + * after the 5th symbol was decoded. + */ + { + int const ctz = ZSTD_countTrailingZeros64(bits[stream]); + int const nbBits = ctz & 7; + int const nbBytes = ctz >> 3; + ip[stream] -= nbBytes; + bits[stream] = MEM_read64(ip[stream]) | 1; + bits[stream] <<= nbBits; + } + } + } while (op[3] < olimit); + } + + /* Save the final values of each of the state variables back to args. */ + ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); + ZSTD_memcpy(&args->ip, &ip, sizeof(ip)); + ZSTD_memcpy(&args->op, &op, sizeof(op)); +} + + +static HUF_FAST_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_fast( void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, - const HUF_DTable* DTable) { + const HUF_DTable* DTable, + HUF_DecompressFastLoopFn loopFn) { void const* dt = DTable + 1; const BYTE* const iend = (const BYTE*)cSrc + 6; BYTE* const oend = (BYTE*)dst + dstSize; - HUF_DecompressAsmArgs args; + HUF_DecompressFastArgs args; { - size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); FORWARD_IF_ERROR(ret, "Failed to init asm args"); - if (ret != 0) - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + if (ret == 0) + return 0; } assert(args.ip[0] >= args.ilimit); - HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + loopFn(&args); /* note : op4 already verified within main loop */ assert(args.ip[0] >= iend); @@ -1372,28 +1649,38 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( /* decoded size */ return dstSize; } -#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, size_t cSrcSize, HUF_DTable const* DTable, int flags) { + HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; + HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; + #if DYNAMIC_BMI2 if (flags & HUF_flags_bmi2) { + fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; # if ZSTD_ENABLE_ASM_X86_64_BMI2 - if (!(flags & HUF_flags_disableAsm)) - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } # endif - return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); + } else { + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } -#else - (void)flags; #endif #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) - if (!(flags & HUF_flags_disableAsm)) - return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); + if (!(flags & HUF_flags_disableAsm)) { + loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; + } #endif - return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); + + if (!(flags & HUF_flags_disableFast)) { + size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); + if (ret != 0) + return ret; + } + return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); } HUF_DGEN(HUF_decompress1X2_usingDTable_internal) diff --git a/lib/decompress/huf_decompress_amd64.S b/lib/decompress/huf_decompress_amd64.S index 8d7c7a6441b..671624fe343 100644 --- a/lib/decompress/huf_decompress_amd64.S +++ b/lib/decompress/huf_decompress_amd64.S @@ -30,14 +30,14 @@ * TODO: Support Windows calling convention. */ -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop) -ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) -.global HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop -.global _HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) +ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) +.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop +.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop +.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop +.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop .text /* Sets up register mappings for clarity. @@ -95,8 +95,9 @@ ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop) /* Define both _HUF_* & HUF_* symbols because MacOS * C symbols are prefixed with '_' & Linux symbols aren't. */ -_HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: +_HUF_decompress4X1_usingDTable_internal_fast_asm_loop: +HUF_decompress4X1_usingDTable_internal_fast_asm_loop: + ZSTD_CET_ENDBRANCH /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx @@ -350,8 +351,9 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop: pop %rax ret -_HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: -HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop: +_HUF_decompress4X2_usingDTable_internal_fast_asm_loop: +HUF_decompress4X2_usingDTable_internal_fast_asm_loop: + ZSTD_CET_ENDBRANCH /* Save all registers - even if they are callee saved for simplicity. */ push %rax push %rbx diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 7e6ecd7df4e..d7e52a2e222 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -243,6 +243,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) dctx->outBufferMode = ZSTD_bm_buffered; dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; + dctx->disableHufAsm = 0; } static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) @@ -1811,6 +1812,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; return bounds; + case ZSTD_d_disableHuffmanAssembly: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + default:; } bounds.error = ERROR(parameter_unsupported); @@ -1851,6 +1857,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value case ZSTD_d_refMultipleDDicts: *value = (int)dctx->refMultipleDDicts; return 0; + case ZSTD_d_disableHuffmanAssembly: + *value = (int)dctx->disableHufAsm; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); @@ -1884,6 +1893,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value } dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; return 0; + case ZSTD_d_disableHuffmanAssembly: + CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); + dctx->disableHufAsm = value != 0; + return 0; default:; } RETURN_ERROR(parameter_unsupported, ""); diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index fbe4bf67440..95a5e109b23 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -141,7 +141,9 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhc = MEM_readLE32(istart); size_t hufSuccess; size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); - int const flags = ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0; + int const flags = 0 + | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) + | (dctx->disableHufAsm ? HUF_flags_disableAsm : 0); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ diff --git a/lib/decompress/zstd_decompress_internal.h b/lib/decompress/zstd_decompress_internal.h index 32685f2d3ba..c2ec5d9fbef 100644 --- a/lib/decompress/zstd_decompress_internal.h +++ b/lib/decompress/zstd_decompress_internal.h @@ -165,6 +165,7 @@ struct ZSTD_DCtx_s ZSTD_dictUses_e dictUses; ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ + int disableHufAsm; /* streaming */ ZSTD_dStreamStage streamStage; diff --git a/lib/zstd.h b/lib/zstd.h index d2c5d3ff5b6..d9e26ee1302 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -614,13 +614,15 @@ typedef enum { * ZSTD_d_stableOutBuffer * ZSTD_d_forceIgnoreChecksum * ZSTD_d_refMultipleDDicts + * ZSTD_d_disableHuffmanAssembly * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly */ ZSTD_d_experimentalParam1=1000, ZSTD_d_experimentalParam2=1001, ZSTD_d_experimentalParam3=1002, - ZSTD_d_experimentalParam4=1003 + ZSTD_d_experimentalParam4=1003, + ZSTD_d_experimentalParam5=1004 } ZSTD_dParameter; @@ -2345,6 +2347,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete */ #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 +/* ZSTD_d_disableHuffmanAssembly + * Set to 1 to disable the Huffman assembly implementation. + * The default value is 0, which allows zstd to use the Huffman assembly + * implementation if available. + * + * This parameter can be used to disable Huffman assembly at runtime. + * If you want to disable it at compile time you can define the macro + * ZSTD_DISABLE_ASM. + */ +#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 + /*! ZSTD_DCtx_setFormat() : * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). diff --git a/tests/fuzz/huf_decompress.c b/tests/fuzz/huf_decompress.c index a9b3540abdf..fcd4b1a3bd2 100644 --- a/tests/fuzz/huf_decompress.c +++ b/tests/fuzz/huf_decompress.c @@ -33,7 +33,8 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_optimalDepth : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_preferRepeat : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_suspectUncompressible : 0) - | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableAsm : 0); + | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableAsm : 0) + | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableFast : 0); /* Select a random cBufSize - it may be too small */ size_t const dBufSize = FUZZ_dataProducer_uint32Range(producer, 0, 8 * size + 500); size_t const maxTableLog = FUZZ_dataProducer_uint32Range(producer, 1, HUF_TABLELOG_MAX); diff --git a/tests/fuzz/huf_round_trip.c b/tests/fuzz/huf_round_trip.c index 9967c57bf62..4d0f8de23f5 100644 --- a/tests/fuzz/huf_round_trip.c +++ b/tests/fuzz/huf_round_trip.c @@ -49,7 +49,8 @@ int LLVMFuzzerTestOneInput(const uint8_t *src, size_t size) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_optimalDepth : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_preferRepeat : 0) | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_suspectUncompressible : 0) - | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableAsm : 0); + | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableAsm : 0) + | (FUZZ_dataProducer_int32Range(producer, 0, 1) ? HUF_flags_disableFast : 0); /* Select a random cBufSize - it may be too small */ size_t const cBufSize = FUZZ_dataProducer_uint32Range(producer, 0, 4 * size); /* Select a random tableLog - we'll adjust it up later */ diff --git a/tests/fuzzer.c b/tests/fuzzer.c index bd3ca0e028b..fc78c7fbc5f 100644 --- a/tests/fuzzer.c +++ b/tests/fuzzer.c @@ -684,6 +684,17 @@ static int basicUnitTests(U32 const seed, double compressibility) if (r != CNBuffSize) goto _output_error; } DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3i : decompress %u bytes with Huffman assembly disabled : ", testNb++, (unsigned)CNBuffSize); + { + ZSTD_DCtx* dctx = ZSTD_createDCtx(); + size_t r; + CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1)); + r = ZSTD_decompress(decodedBuffer, CNBuffSize, compressedBuffer, cSize); + if (r != CNBuffSize || memcmp(decodedBuffer, CNBuffer, CNBuffSize)) goto _output_error; + ZSTD_freeDCtx(dctx); + } + DISPLAYLEVEL(3, "OK \n"); + DISPLAYLEVEL(3, "test%3i : check decompressed result : ", testNb++); { size_t u; for (u=0; u Date: Mon, 23 Jan 2023 20:23:43 -0800 Subject: [PATCH 2/2] [version-test] Work around bugs in v0.7.3 dict builder Before calling a dictionary good, make sure that it can compress an input. If v0.7.3 rejects v0.7.3's dictionary, fall back to the v1.0 dictionary. This is not the job of the verison test to test it, because we cannot fix this code. --- tests/test-zstd-versions.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test-zstd-versions.py b/tests/test-zstd-versions.py index d6784d61567..1bcf39e2b25 100755 --- a/tests/test-zstd-versions.py +++ b/tests/test-zstd-versions.py @@ -85,6 +85,18 @@ def get_git_tags(): return tags +def dict_ok(tag, dict_name, sample): + if not os.path.isfile(dict_name): + return False + try: + cmd = ['./zstd.' + tag, '-D', dict_name] + with open(sample, "rb") as i: + subprocess.check_call(cmd, stdin=i, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + return True + except: + return False + + def create_dict(tag, dict_source_path, fallback_tag=None): dict_name = 'dict.' + tag if not os.path.isfile(dict_name): @@ -96,7 +108,7 @@ def create_dict(tag, dict_source_path, fallback_tag=None): result = execute('./dictBuilder.' + tag + ' ' + ' '.join(files) + ' -o ' + dict_name, print_output=False, param_shell=True) else: result = execute('./zstd.' + tag + ' -f --train ' + ' '.join(files) + ' -o ' + dict_name, print_output=False, param_shell=True) - if result == 0 and os.path.isfile(dict_name): + if result == 0 and dict_ok(tag, dict_name, files[0]): print(dict_name + ' created') elif fallback_tag is not None: fallback_dict_name = 'dict.' + fallback_tag