From e6dccbf48246f4e2844251972fcc0946a5de5154 Mon Sep 17 00:00:00 2001 From: Han Zhu Date: Mon, 27 Mar 2023 15:57:55 -0700 Subject: [PATCH 1/2] Inline BIT_reloadDStream Inlining `BIT_reloadDStream` provided >3% decompression speed improvement for clang PGO-optimized zstd binary, measured using the Silesia corpus with compression level 1. The win comes from improved register allocation which leads to fewer spills and reloads. Take a look at this comparison of profile-annotated hot assembly before and after this change: https://www.diffchecker.com/UjDGIyLz/. The diff is a bit messy, but notice three fewer moves after inlining. In general LLVM's register allocator works better when it can see more code. For example, when the register allocator sees a call instruction, it partitions the registers into caller registers and callee registers, and it is not free to do whatever it wants with all the registers for the current function. Inlining the callee lets the register allocation access all registers and use them more flexsibly. --- lib/common/bitstream.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/common/bitstream.h b/lib/common/bitstream.h index db1b4cf1369..72b0b3df227 100644 --- a/lib/common/bitstream.h +++ b/lib/common/bitstream.h @@ -396,7 +396,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) * This function is safe, it guarantees it will not read beyond src buffer. * @return : status of `BIT_DStream_t` internal register. * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ -MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) +MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) { if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; From b558190ac76fe6b0f2c42ae5fb9d2f90652d21b0 Mon Sep 17 00:00:00 2001 From: Han Zhu Date: Tue, 28 Mar 2023 14:33:50 -0700 Subject: [PATCH 2/2] Remove clang-only branch hints from ZSTD_decodeSequence Looking at the __builtin_expect in ZSTD_decodeSequence: { size_t offset; #if defined(__clang__) if (LIKELY(ofBits > 1)) { #else if (ofBits > 1) { #endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); From profile-annotated assembly, the probability of ofBits > 1 is about 75% (101k counts out of 135k counts). This is much smaller than the recommended likelihood to use __builtin_expect which is 99%. As a result, clang moved the else block further away which hurts cache locality. Removing this __built_expect along with two others in ZSTD_decodeSequence gave better performance when PGO is enabled. I suggest to remove these branch hints and rely on PGO which leverages runtime profiles from actual workload to calculate branch probability instead. --- lib/decompress/zstd_decompress_block.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/lib/decompress/zstd_decompress_block.c b/lib/decompress/zstd_decompress_block.c index 2a7d70d6911..f018271765d 100644 --- a/lib/decompress/zstd_decompress_block.c +++ b/lib/decompress/zstd_decompress_block.c @@ -1232,11 +1232,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) /* sequence */ { size_t offset; - #if defined(__clang__) - if (LIKELY(ofBits > 1)) { - #else if (ofBits > 1) { - #endif ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); @@ -1273,11 +1269,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) seq.offset = offset; } - #if defined(__clang__) - if (UNLIKELY(mlBits > 0)) - #else if (mlBits > 0) - #endif seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) @@ -1287,11 +1279,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - #if defined(__clang__) - if (UNLIKELY(llBits > 0)) - #else if (llBits > 0) - #endif seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); if (MEM_32bits())