From ee9f434973820d4836b711e461349fe7e7975cc1 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Tue, 7 Mar 2023 12:11:09 -0800 Subject: [PATCH 01/13] Add init once memory - Adds memory type that is guaranteed to have been initialized at least once in the workspace's lifetime. - Changes tag space in row hash to be based on init once memory. - Moves buffers to aligned memory and removes the buffer memory type. --- lib/common/compiler.h | 4 + lib/compress/zstd_compress.c | 86 +++++++++--------- lib/compress/zstd_cwksp.h | 165 +++++++++++++++++++++++------------ lib/compress/zstd_ldm.c | 4 +- lib/compress/zstd_opt.c | 2 + 5 files changed, 158 insertions(+), 103 deletions(-) diff --git a/lib/common/compiler.h b/lib/common/compiler.h index d4f2f285d79..73f8d01998b 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -311,6 +311,10 @@ void __msan_poison(const volatile void *a, size_t size); /* Returns the offset of the first (at least partially) poisoned byte in the memory range, or -1 if the whole range is good. */ intptr_t __msan_test_shadow(const volatile void *x, size_t size); + +/* Print shadow and origin for the memory range to stderr in a human-readable + format. */ +void __msan_print_shadow(const volatile void *x, size_t size); #endif #if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index e04d207e504..6408ae8ccad 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1630,9 +1630,9 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); - size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + size_t const tokenSpace = ZSTD_cwksp_aligned_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + + 3 * ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); @@ -1643,28 +1643,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; - size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) - + ZSTD_cwksp_alloc_size(buffOutSize); + size_t const bufferSpace = ZSTD_cwksp_aligned_alloc_size(buffInSize) + + ZSTD_cwksp_aligned_alloc_size(buffOutSize); - size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const cctxSpace = isStatic ? ZSTD_cwksp_aligned_alloc_size(sizeof(ZSTD_CCtx)) : 0; size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); size_t const externalSeqSpace = useSequenceProducer ? 
ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) : 0; - size_t const neededSpace = - cctxSpace + - entropySpace + - blockStateSpace + - ldmSpace + - ldmSeqSpace + - matchStateSize + - tokenSpace + - bufferSpace + - externalSeqSpace; - - DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + size_t const objectsSpace = cctxSpace + entropySpace + blockStateSpace; + size_t const tableSpace = ldmSpace + matchStateSize; + size_t const alignedSpace = tokenSpace + bufferSpace + externalSeqSpace + ldmSeqSpace; + + size_t const neededSpace = objectsSpace + tableSpace + alignedSpace; return neededSpace; } @@ -1932,6 +1925,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp_clean_tables(ws); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize * sizeof(U16); + ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + /* opt parser space */ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { DEBUGLOG(4, "reserving optimal parser space"); @@ -1943,19 +1947,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } - if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { - { /* Row match finder needs an additional table of hashes ("tags") */ - size_t const tagTableSize = hSize*sizeof(U16); - ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); - if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); - } - { /* Switch to 32-entry rows if searchLog is 5 (or more) */ - U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); - assert(cParams->hashLog >= rowLog); - ms->rowHashLog = cParams->hashLog - rowLog; - } - } - ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, @@ -2102,18 +2093,27 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + ¶ms->cParams, + params->useRowMatchFinder, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
*/ - zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.litStart = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, blockSize + WILDCOPY_OVERLENGTH); zc->seqStore.maxNbLit = blockSize; /* buffers */ zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->inBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffInSize); zc->outBuffSize = buffOutSize; - zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + zc->outBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffOutSize); /* ldm bucketOffsets table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { @@ -2121,26 +2121,18 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const numBuckets = ((size_t)1) << (params->ldmParams.hashLog - params->ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + zc->ldmState.bucketOffsets = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, numBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } /* sequences storage */ ZSTD_referenceExternalSequences(zc, NULL, 0); zc->seqStore.maxNbSeq = maxNbSeq; - zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.llCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - FORWARD_IF_ERROR(ZSTD_reset_matchState( - &zc->blockState.matchState, - ws, - ¶ms->cParams, - params->useRowMatchFinder, - crp, - needsIndexReset, - ZSTD_resetTarget_CCtx), ""); /* ldm hash table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { @@ -5195,7 +5187,7 @@ size_t ZSTD_estimateCDictSize_advanced( + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); + : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) @@ -5280,7 +5272,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 - : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index 9113cdbd4fb..06cf2e77b38 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -16,6 +16,7 @@ ***************************************/ #include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "../common/zstd_internal.h" +#include "../common/portability_macros.h" #if defined (__cplusplus) extern "C" { @@ -45,7 +46,7 @@ extern "C" { ***************************************/ typedef enum { ZSTD_cwksp_alloc_objects, - ZSTD_cwksp_alloc_buffers, + ZSTD_cwksp_alloc_aligned_init_once, ZSTD_cwksp_alloc_aligned } ZSTD_cwksp_alloc_phase_e; @@ -99,8 +100,8 @@ typedef enum { * * Workspace Layout: * - * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] + * [ ... workspace ... ] + * [objects][tables ... ->] free space [<- ... aligned][<- ... init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: @@ -124,13 +125,19 @@ typedef enum { * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). * Their sizes depend on the cparams. These tables are 64-byte aligned. * - * - Aligned: these buffers are used for various purposes that require 4 byte - * alignment, but don't require any initialization before they're used. These - * buffers are each aligned to 64 bytes. + * - Init once: these buffers require to be initialized at least once before + * use. They should be used when we want to skip memory initialization + * while not triggering memory checkers (like Valgrind) when reading from + * from this memory without writing to it first. + * These buffers should be used carefully as they might contain data + * from previous compressions. + * Buffers are aligned to 64 bytes. + * + * - Aligned: these buffers don't require any initialization before they're + * used. The user of the buffer should make sure they write into a buffer + * location before reading from it. + * Buffers are aligned to 64 bytes. * - * - Buffers: these buffers are used for various purposes that don't require - * any alignment or initialization before they're used. This means they can - * be moved around at no cost for a new compression. * * Allocating Memory: * @@ -138,8 +145,8 @@ typedef enum { * correctly packed into the workspace buffer. That order is: * * 1. Objects - * 2. Buffers - * 3. Aligned/Tables + * 2. Init once / Tables + * 3. Aligned / Tables * * Attempts to reserve objects of different types out of order will fail. 
*/ @@ -151,6 +158,7 @@ typedef struct { void* tableEnd; void* tableValidEnd; void* allocStart; + void* initOnceStart; BYTE allocFailed; int workspaceOversizedDuration; @@ -163,6 +171,7 @@ typedef struct { ***************************************/ MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { (void)ws; @@ -172,6 +181,21 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { assert(ws->tableEnd <= ws->allocStart); assert(ws->tableValidEnd <= ws->allocStart); assert(ws->allocStart <= ws->workspaceEnd); + assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); + assert(ws->workspace <= ws->initOnceStart); + assert((size_t)ws->allocStart % ZSTD_CWKSP_ALIGNMENT_BYTES == 0); +#if ZSTD_MEMORY_SANITIZER + { + intptr_t const offset = __msan_test_shadow(ws->initOnceStart, + (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart); +#if defined(ZSTD_MSAN_PRINT) + if(offset!=-1) { + __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32); + } +#endif + assert(offset==-1); + }; +#endif } /** @@ -218,12 +242,9 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { * for internal purposes (currently only alignment). */ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { - /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes - * to align the beginning of tables section, as well as another n_2=[0, 63] bytes - * to align the beginning of the aligned section. - * - * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and - * aligneds being sized in multiples of 64 bytes. + /* For alignment, the wksp will always allocate an additional ZSTD_CWKSP_ALIGNMENT_BYTES + * bytes to align the beginning of tables section, this will ensure that tables section + * and aligned buffers sections are aligned. */ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; return slackSpace; @@ -238,10 +259,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt size_t const alignBytesMask = alignBytes - 1; size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; assert((alignBytes & alignBytesMask) == 0); - assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); + assert(bytes < alignBytes); return bytes; } +/** + * Returns the initial value for allocStart which is used to determine the position from + * which we can allocate from the end of the workspace. + */ +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { + return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); +} + /** * Internal function. Do not use directly. 
* Reserves the given number of bytes within the aligned/buffer segment of the wksp, @@ -282,23 +311,12 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase { assert(phase >= ws->phase); if (phase > ws->phase) { - /* Going from allocating objects to allocating buffers */ - if (ws->phase < ZSTD_cwksp_alloc_buffers && - phase >= ZSTD_cwksp_alloc_buffers) { + /* Going from allocating objects to allocating aligned / tables */ + if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && + phase >= ZSTD_cwksp_alloc_aligned_init_once) { ws->tableValidEnd = ws->objectEnd; - } + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); - /* Going from allocating buffers to allocating aligneds/tables */ - if (ws->phase < ZSTD_cwksp_alloc_aligned && - phase >= ZSTD_cwksp_alloc_aligned) { - { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ - size_t const bytesToAlign = - ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); - DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); - ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ - RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), - memory_allocation, "aligned phase - alignment initial allocation failed!"); - } { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ void* const alloc = ws->objectEnd; size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); @@ -310,7 +328,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; - } } } + } } } ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } @@ -322,7 +340,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase */ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { - return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); } /** @@ -359,12 +377,35 @@ ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase return alloc; } + /** - * Reserves and returns unaligned memory. + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + * This memory has been initialized at least once in the past. + * This doesn't mean it has been initialized this time, and it might contain data from previous + * operations. + * The main usage is for algorithms that might need read access into uninitialized memory. + * The algorithm must maintain safety under these conditions and must make sure it doesn't + * leak any of the past data (directly or in side channels). */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) { - return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + if(ptr && ptr < ws->initOnceStart) { + /* We assume the memory following the current allocation is either: + * 1. 
Not usable as initOnce memory (end of workspace) + * 2. Another initOnce buffer that has been allocated before (and so was previously memset) + * 3. An ASAN redzone, in which case we don't want to write on it + * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. + * Note that we assume here tha MSAN and ASAN cannot run in the same time. */ + ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); + ws->initOnceStart = ptr; + } +#if ZSTD_MEMORY_SANITIZER + assert(__msan_test_shadow(ptr, bytes) == -1); +#endif + return ptr; } /** @@ -385,13 +426,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) */ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { - const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; void* alloc; void* end; void* top; - if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { - return NULL; + /* We can only start allocating tables after we are done reserving space for objects at the + * start of the workspace */ + if(ws->phase < phase) { + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } } alloc = ws->tableEnd; end = (BYTE *)alloc + bytes; @@ -470,11 +515,19 @@ MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) /* To validate that the table re-use logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table - * space every time we mark it dirty. */ + * space every time we mark it dirty. + * Since tableValidEnd space and initOnce space may overlap we don't poison + * the initOnce portion as it break its promise. This means that this poisoning + * check isn't always applied fully. */ { size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; assert(__msan_test_shadow(ws->objectEnd, size) == -1); - __msan_poison(ws->objectEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + __msan_poison(ws->objectEnd, size); + } else { + assert(ws->initOnceStart >= ws->objectEnd); + __msan_poison(ws->objectEnd, (BYTE*)ws->initOnceStart - (BYTE*)ws->objectEnd); + } } #endif @@ -539,11 +592,14 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) /* To validate that the context re-use logic is sound, and that we don't * access stuff that this compression hasn't initialized, we re-"poison" - * the workspace (or at least the non-static, non-table parts of it) - * every time we start a new compression. */ + * the workspace except for the areas in which we expect memory re-use + * without initialization (objects, valid tables area and init once + * memory). 
*/ { - size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; - __msan_poison(ws->tableValidEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + size_t size = (BYTE*)ws->initOnceStart - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } } #endif @@ -559,10 +615,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { #endif ws->tableEnd = ws->objectEnd; - ws->allocStart = ws->workspaceEnd; + ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); ws->allocFailed = 0; - if (ws->phase > ZSTD_cwksp_alloc_buffers) { - ws->phase = ZSTD_cwksp_alloc_buffers; + if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { + ws->phase = ZSTD_cwksp_alloc_aligned_init_once; } ZSTD_cwksp_assert_internal_consistency(ws); } @@ -579,6 +635,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c ws->workspaceEnd = (BYTE*)start + size; ws->objectEnd = ws->workspace; ws->tableValidEnd = ws->objectEnd; + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); ws->phase = ZSTD_cwksp_alloc_objects; ws->isStatic = isStatic; ZSTD_cwksp_clear(ws); @@ -616,7 +673,7 @@ MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) - + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); } MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { @@ -634,13 +691,13 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, size_t const estimatedSpace, int resizedWorkspace) { if (resizedWorkspace) { - /* Resized/newly allocated wksp should have exact bounds */ - return ZSTD_cwksp_used(ws) == estimatedSpace; + /* Resized/newly allocated wksp should have exact bounds up to one alignment */ + return (estimatedSpace - 64) <= ZSTD_cwksp_used(ws) && ZSTD_cwksp_used(ws) <= estimatedSpace; } else { /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes * than estimatedSpace. See the comments in zstd_cwksp.h for details. */ - return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); + return (ZSTD_cwksp_used(ws) >= estimatedSpace - 64) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); } } diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c index 3d74ff19e3c..aaa9473c3e6 100644 --- a/lib/compress/zstd_ldm.c +++ b/lib/compress/zstd_ldm.c @@ -157,8 +157,8 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmHSize = ((size_t)1) << params.hashLog; size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); - size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) - + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + size_t const totalSize = ZSTD_cwksp_aligned_alloc_size(ldmBucketSize) + + ZSTD_cwksp_aligned_alloc_size(ldmHSize * sizeof(ldmEntry_t)); return params.enableLdm == ZSTD_ps_enable ? 
totalSize : 0; } diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c index fdd7f9d8b5a..f02a760946e 100644 --- a/lib/compress/zstd_opt.c +++ b/lib/compress/zstd_opt.c @@ -1086,6 +1086,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, ZSTD_optimal_t lastSequence; ZSTD_optLdm_t optLdm; + ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); From c4fd893e72685f7500ac3163bde6aa1026287dd6 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Tue, 7 Mar 2023 14:41:33 -0800 Subject: [PATCH 02/13] Restores unaligned buffer allocations, but changes allocation order to allocate them after aligned buffers --- lib/compress/zstd_compress.c | 94 +++++++++++++++++++----------------- lib/compress/zstd_cwksp.h | 44 +++++++++-------- lib/compress/zstd_ldm.c | 4 +- 3 files changed, 77 insertions(+), 65 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 6408ae8ccad..346ca2873be 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1630,9 +1630,9 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); - size_t const tokenSpace = ZSTD_cwksp_aligned_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(BYTE)); + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); @@ -1643,21 +1643,28 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; - size_t const bufferSpace = ZSTD_cwksp_aligned_alloc_size(buffInSize) - + ZSTD_cwksp_aligned_alloc_size(buffOutSize); + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + + ZSTD_cwksp_alloc_size(buffOutSize); - size_t const cctxSpace = isStatic ? ZSTD_cwksp_aligned_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); size_t const externalSeqSpace = useSequenceProducer ? 
ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) : 0; - size_t const objectsSpace = cctxSpace + entropySpace + blockStateSpace; - size_t const tableSpace = ldmSpace + matchStateSize; - size_t const alignedSpace = tokenSpace + bufferSpace + externalSeqSpace + ldmSeqSpace; - - size_t const neededSpace = objectsSpace + tableSpace + alignedSpace; + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace + + externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; } @@ -2102,38 +2109,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, needsIndexReset, ZSTD_resetTarget_CCtx), ""); - /* ZSTD_wildcopy() is used to copy into the literals buffer, - * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. - */ - zc->seqStore.litStart = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, blockSize + WILDCOPY_OVERLENGTH); - zc->seqStore.maxNbLit = blockSize; - - /* buffers */ - zc->bufferedPolicy = zbuff; - zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffInSize); - zc->outBuffSize = buffOutSize; - zc->outBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffOutSize); - - /* ldm bucketOffsets table */ - if (params->ldmParams.enableLdm == ZSTD_ps_enable) { - /* TODO: avoid memset? */ - size_t const numBuckets = - ((size_t)1) << (params->ldmParams.hashLog - - params->ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, numBuckets); - ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); - } - - /* sequences storage */ - ZSTD_referenceExternalSequences(zc, NULL, 0); - zc->seqStore.maxNbSeq = maxNbSeq; - zc->seqStore.llCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - /* ldm hash table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ @@ -2155,8 +2132,39 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); } + /* buffers */ + + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->outBuffSize = buffOutSize; + zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + + /* ldm bucketOffsets table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? 
*/ + size_t const numBuckets = + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); + } + + /* sequences storage */ + ZSTD_referenceExternalSequences(zc, NULL, 0); + zc->seqStore.maxNbSeq = maxNbSeq; + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); - assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); zc->initialized = 1; @@ -5187,7 +5195,7 @@ size_t ZSTD_estimateCDictSize_advanced( + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) @@ -5272,7 +5280,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index 06cf2e77b38..a0759a72901 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -47,7 +47,8 @@ extern "C" { typedef enum { ZSTD_cwksp_alloc_objects, ZSTD_cwksp_alloc_aligned_init_once, - ZSTD_cwksp_alloc_aligned + ZSTD_cwksp_alloc_aligned, + ZSTD_cwksp_alloc_buffers } ZSTD_cwksp_alloc_phase_e; /** @@ -101,7 +102,7 @@ typedef enum { * Workspace Layout: * * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... init once] + * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: @@ -138,6 +139,9 @@ typedef enum { * location before reading from it. * Buffers are aligned to 64 bytes. * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can + * be moved around at no cost for a new compression. * * Allocating Memory: * @@ -147,6 +151,7 @@ typedef enum { * 1. Objects * 2. Init once / Tables * 3. Aligned / Tables + * 4. Buffers / Tables * * Attempts to reserve objects of different types out of order will fail. 
*/ @@ -183,7 +188,6 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { assert(ws->allocStart <= ws->workspaceEnd); assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); assert(ws->workspace <= ws->initOnceStart); - assert((size_t)ws->allocStart % ZSTD_CWKSP_ALIGNMENT_BYTES == 0); #if ZSTD_MEMORY_SANITIZER { intptr_t const offset = __msan_test_shadow(ws->initOnceStart, @@ -242,11 +246,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { * for internal purposes (currently only alignment). */ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { - /* For alignment, the wksp will always allocate an additional ZSTD_CWKSP_ALIGNMENT_BYTES - * bytes to align the beginning of tables section, this will ensure that tables section - * and aligned buffers sections are aligned. + /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES + * bytes to align the beginning of tables section and end of buffers; */ - size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; return slackSpace; } @@ -311,7 +314,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase { assert(phase >= ws->phase); if (phase > ws->phase) { - /* Going from allocating objects to allocating aligned / tables */ + /* Going from allocating objects to allocating initOnce / tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && phase >= ZSTD_cwksp_alloc_aligned_init_once) { ws->tableValidEnd = ws->objectEnd; @@ -377,6 +380,13 @@ ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase return alloc; } +/** + * Reserves and returns unaligned memory. + */ +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); +} /** * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). @@ -673,7 +683,7 @@ MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) - + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); } MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { @@ -688,17 +698,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Returns if the estimated space needed for a wksp is within an acceptable limit of the * actual amount of space used. */ -MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, - size_t const estimatedSpace, int resizedWorkspace) { - if (resizedWorkspace) { - /* Resized/newly allocated wksp should have exact bounds up to one alignment */ - return (estimatedSpace - 64) <= ZSTD_cwksp_used(ws) && ZSTD_cwksp_used(ws) <= estimatedSpace; - } else { - /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes - * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
- */ - return (ZSTD_cwksp_used(ws) >= estimatedSpace - 64) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); - } +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { + /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice + * the alignment bytes difference between estimation and actual usage */ + return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && + ZSTD_cwksp_used(ws) <= estimatedSpace; } diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c index aaa9473c3e6..3d74ff19e3c 100644 --- a/lib/compress/zstd_ldm.c +++ b/lib/compress/zstd_ldm.c @@ -157,8 +157,8 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmHSize = ((size_t)1) << params.hashLog; size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); - size_t const totalSize = ZSTD_cwksp_aligned_alloc_size(ldmBucketSize) - + ZSTD_cwksp_aligned_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; } From baefad5efe6101e9e00c4817d21562dacf64b1fd Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Thu, 9 Mar 2023 09:02:53 -0800 Subject: [PATCH 03/13] CR fixes --- lib/compress/zstd_cwksp.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index a0759a72901..cc7fb1c715c 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -321,9 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ - void* const alloc = ws->objectEnd; + void *const alloc = ws->objectEnd; size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); - void* const objectEnd = (BYTE*)alloc + bytesToAlign; + void *const objectEnd = (BYTE *) alloc + bytesToAlign; DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, "table phase - alignment initial allocation failed!"); @@ -331,7 +331,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; - } } } + } + } + } ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } @@ -408,7 +410,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t byt * 2. Another initOnce buffer that has been allocated before (and so was previously memset) * 3. An ASAN redzone, in which case we don't want to write on it * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. - * Note that we assume here tha MSAN and ASAN cannot run in the same time. */ + * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ws->initOnceStart = ptr; } From 420418531ca80145ecbaa75bab1ee09be87d499f Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Tue, 7 Mar 2023 15:24:23 -0800 Subject: [PATCH 04/13] Introduce salt into row hash (#3528) This helps to avoid regressions where consecutive compressions use the same tag space with similar data (running `zstd -b5e7 enwik8 -B128K` reproduces this regression). --- lib/compress/zstd_compress.c | 12 +++++-- lib/compress/zstd_compress_internal.h | 50 ++++++++++++++++++++------- lib/compress/zstd_lazy.c | 16 +++++---- 3 files changed, 57 insertions(+), 21 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 346ca2873be..36ecd5675e2 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1936,6 +1936,13 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, /* Row match finder needs an additional table of hashes ("tags") */ size_t const tagTableSize = hSize * sizeof(U16); ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + /* We want to generate a new salt in case we reset a Cctx, but we always want to use + * 0 when we reset a Cdict */ + if(forWho == ZSTD_resetTarget_CCtx) { + ms->hashSalt = ms->hashSalt * 6364136223846793005 + 1; /* based on MUSL rand */ + } else { + ms->hashSalt = 0; + } { /* Switch to 32-entry rows if searchLog is 5 (or more) */ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); assert(cParams->hashLog >= rowLog); @@ -2341,8 +2348,9 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { size_t const tagTableSize = hSize*sizeof(U16); ZSTD_memcpy(cctx->blockState.matchState.tagTable, - cdict->matchState.tagTable, - tagTableSize); + cdict->matchState.tagTable, + tagTableSize); + cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; } } diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index cbb85e527eb..148cff7617e 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -228,6 +228,7 @@ struct ZSTD_matchState_t { U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ + U64 hashSalt; U32* hashTable; U32* hashTable3; @@ -787,28 +788,35 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, * Hashes ***************************************/ static const U32 prime3bytes = 506832829U; -static U32 ZSTD_hash3(U32 u, U32 h) { assert(h <= 32); return ((u << (32-24)) * prime3bytes) >> (32-h) ; } -MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ +static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } +MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ +MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } static const U32 prime4bytes = 2654435761U; -static U32 ZSTD_hash4(U32 u, U32 h) { assert(h <= 32); return (u * prime4bytes) >> (32-h) ; } -static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h); } +static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } +static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } +static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } static const U64 prime5bytes = 889523592379ULL; -static size_t ZSTD_hash5(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } -static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } +static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } static const U64 prime6bytes = 227718039650203ULL; -static size_t ZSTD_hash6(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } -static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } +static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } static const U64 prime7bytes = 58295818150454627ULL; -static size_t ZSTD_hash7(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } -static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } +static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; -static size_t ZSTD_hash8(U64 u, U32 h) { assert(h <= 64); return (size_t)(((u) * prime8bytes) >> (64-h)) ; } -static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } 
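(As an illustrative aside, not part of the patch: the surrounding hunk threads a salt value through the ZSTD_hash* helpers, XOR-ing it in after the prime multiplication. A minimal standalone sketch of the effect follows; the prime constant is copied from the patch, while hash8_salted, the salt constants, and the hBits value are made-up example values.)

    #include <stdint.h>
    #include <stdio.h>

    /* Prime taken from the patch; everything else here is illustrative. */
    static const uint64_t prime8bytes = 0xCF1BBCDCB7A56463ULL;

    /* Same shape as the patched ZSTD_hash8(): multiply by the prime,
     * fold in the salt, then keep the top hBits bits. */
    static size_t hash8_salted(uint64_t u, uint32_t hBits, uint64_t salt)
    {
        return (size_t)(((u * prime8bytes) ^ salt) >> (64 - hBits));
    }

    int main(void)
    {
        uint64_t const input = 0x0123456789ABCDEFULL;  /* same 8 input bytes */
        uint32_t const hBits = 22;                     /* e.g. hashLog + tag bits */

        /* salt == 0 matches the CDict case: hashes stay reproducible.
         * A fresh salt per CCtx reset maps the same bytes to a different
         * row/tag, so leftover tag-table entries from an earlier
         * compression stop aliasing new positions. */
        printf("salt = 0:        0x%zx\n", hash8_salted(input, hBits, 0));
        printf("salt = nonzero:  0x%zx\n", hash8_salted(input, hBits, 0x9E3779B97F4A7C15ULL));
        return 0;
    }

(Keeping the salt at 0 for CDicts and only advancing it on CCtx resets is what lets a dictionary's tag table and hashSalt be copied verbatim into a CCtx, while each fresh compression still gets its own tag space.)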
+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } +static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } +static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } + MEM_STATIC FORCE_INLINE_ATTR size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) @@ -828,6 +836,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) } } +MEM_STATIC FORCE_INLINE_ATTR +size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { + /* Although some of these hashes do support hBits up to 64, some do not. + * To be on the safe side, always avoid hBits > 32. */ + assert(hBits <= 32); + + switch(mls) + { + default: + case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); + case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); + case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); + case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); + case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); + } +} + + /** ZSTD_ipow() : * Return base^exponent. */ diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index a2473427299..9534eb715fb 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -850,7 +850,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); for (; idx < lim; ++idx) { - U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; @@ -868,9 +868,10 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, U16 const* tagTable, BYTE const* base, U32 idx, U32 const hashLog, - U32 const rowLog, U32 const mls) + U32 const rowLog, U32 const mls, + U64 const hashSalt) { - U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; @@ -894,15 +895,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { - U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) - : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const hash = useCache ? 
ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) + : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32* const row = hashTable + relRow; BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. Explicit cast allows us to get exact desired position within each row */ U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); + assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; row[pos] = updateStartIdx; } @@ -1163,6 +1164,7 @@ size_t ZSTD_RowFindBestMatch( const U32 rowMask = rowEntries - 1; const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); + const U64 hashSalt = ms->hashSalt; U32 nbAttempts = 1U << cappedSearchLog; size_t ml=4-1; @@ -1200,7 +1202,7 @@ size_t ZSTD_RowFindBestMatch( /* Update the hashTable and tagTable up to (but not including) ip */ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); { /* Get the hash for ip, compute the appropriate row */ - U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; U32* const row = hashTable + relRow; From 462f3721464388ffc1fad5258e515a534cfad5c3 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Thu, 9 Mar 2023 09:51:27 -0800 Subject: [PATCH 05/13] CR fixes --- lib/compress/zstd_compress.c | 5 ++++- lib/compress/zstd_compress_internal.h | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 36ecd5675e2..05708d8cb88 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1935,12 +1935,15 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { /* Row match finder needs an additional table of hashes ("tags") */ size_t const tagTableSize = hSize * sizeof(U16); - ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); /* We want to generate a new salt in case we reset a Cctx, but we always want to use * 0 when we reset a Cdict */ if(forWho == ZSTD_resetTarget_CCtx) { + ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ms->hashSalt = ms->hashSalt * 6364136223846793005 + 1; /* based on MUSL rand */ } else { + /* When we are not salting we want to always memset the memory */ + ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); + ZSTD_memset(ms->tagTable, 0, tagTableSize); ms->hashSalt = 0; } { /* Switch to 32-entry rows if searchLog is 5 (or more) */ diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 148cff7617e..12ae5b4dcef 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -845,7 +845,7 @@ size_t ZSTD_hashPtrSalted(const void* p, 
U32 hBits, U32 mls, const U64 hashSalt) switch(mls) { default: - case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); + case 4: return ZSTD_hash4PtrS(p, hBits, (U32)(hashSalt >> 32)); case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); From 81f6313951ad33b64067d87c86d7aa6d47ba0b23 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Thu, 9 Mar 2023 13:06:34 -0800 Subject: [PATCH 06/13] CR fixes and better entropy for hash salt --- lib/compress/zstd_compress.c | 3 ++- lib/compress/zstd_compress_internal.h | 5 +++-- lib/compress/zstd_lazy.c | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 05708d8cb88..71f9f0c60d4 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1939,7 +1939,8 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, * 0 when we reset a Cdict */ if(forWho == ZSTD_resetTarget_CCtx) { ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); - ms->hashSalt = ms->hashSalt * 6364136223846793005 + 1; /* based on MUSL rand */ + ms->hashSalt = (U64) ZSTD_hashPtr(&ms->hashSalt, 32, sizeof(ms->hashSalt)) << 32 | + ZSTD_hashPtr(&ms->hashSaltEntropy, 32, sizeof(ms->hashSaltEntropy)); } else { /* When we are not salting we want to always memset the memory */ ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 12ae5b4dcef..223f4fde119 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -228,7 +228,8 @@ struct ZSTD_matchState_t { U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ - U64 hashSalt; + U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ + U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ U32* hashTable; U32* hashTable3; @@ -845,7 +846,7 @@ size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) switch(mls) { default: - case 4: return ZSTD_hash4PtrS(p, hBits, (U32)(hashSalt >> 32)); + case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 9534eb715fb..1fa96521888 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -906,6 +906,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; row[pos] = updateStartIdx; + ms->hashSaltEntropy += hash; } } From 254329554352295b3e6cd63dfc7c04807f05ebec Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Thu, 9 Mar 2023 13:46:04 -0800 Subject: [PATCH 07/13] Slightly improve entropy performance --- lib/common/bits.h | 25 +++++++++++++++++++++++++ lib/compress/zstd_compress.c | 18 +++++++++++++++--- lib/compress/zstd_lazy.c | 29 +++-------------------------- 3 files changed, 43 insertions(+), 29 deletions(-) diff --git a/lib/common/bits.h b/lib/common/bits.h index 4a9bbf58c5d..378a34047a7 100644 --- a/lib/common/bits.h +++ b/lib/common/bits.h @@ -172,4 +172,29 @@ MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCo return 31 - ZSTD_countLeadingZeros32(val); } +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. 
+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &= 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &= 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &= 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + #endif /* ZSTD_BITS_H */ diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 71f9f0c60d4..43879a2ce7d 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -27,7 +27,7 @@ #include "zstd_opt.h" #include "zstd_ldm.h" #include "zstd_compress_superblock.h" -#include "../common/bits.h" /* ZSTD_highbit32 */ +#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ /* *************************************************************** * Tuning parameters @@ -1884,6 +1884,19 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; +/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ +static U64 ZSTD_bitmix(U64 val, U64 len) { + val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); + val *= 0x9FB21C651E98DF25ULL; + val ^= (val >> 35) + len ; + val *= 0x9FB21C651E98DF25ULL; + return val ^ (val >> 28); +} + +/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ +static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { + ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); +} static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, @@ -1939,8 +1952,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, * 0 when we reset a Cdict */ if(forWho == ZSTD_resetTarget_CCtx) { ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); - ms->hashSalt = (U64) ZSTD_hashPtr(&ms->hashSalt, 32, sizeof(ms->hashSalt)) << 32 | - ZSTD_hashPtr(&ms->hashSaltEntropy, 32, sizeof(ms->hashSaltEntropy)); + ZSTD_advanceHashSalt(ms); } else { /* When we are not salting we want to always memset the memory */ ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c index 1fa96521888..3ec97ce8488 100644 --- a/lib/compress/zstd_lazy.c +++ b/lib/compress/zstd_lazy.c @@ -774,31 +774,6 @@ MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { return ZSTD_countTrailingZeros64(val); } -/* ZSTD_rotateRight_*(): - * Rotates a bitfield to the right by "count" bits. 
- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts - */ -FORCE_INLINE_TEMPLATE -U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { - assert(count < 64); - count &= 0x3F; /* for fickle pattern recognition */ - return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); -} - -FORCE_INLINE_TEMPLATE -U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { - assert(count < 32); - count &= 0x1F; /* for fickle pattern recognition */ - return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); -} - -FORCE_INLINE_TEMPLATE -U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { - assert(count < 16); - count &= 0x0F; /* for fickle pattern recognition */ - return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); -} - /* ZSTD_row_nextIndex(): * Returns the next index to insert at within a tagTable row, and updates the "head" * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) @@ -891,6 +866,7 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, U32* const hashTable = ms->hashTable; U16* const tagTable = ms->tagTable; U32 const hashLog = ms->rowHashLog; + U32 hashSaltEntropyCollected = 0; const BYTE* const base = ms->window.base; DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); @@ -906,8 +882,9 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; row[pos] = updateStartIdx; - ms->hashSaltEntropy += hash; + hashSaltEntropyCollected = hash; } + ms->hashSaltEntropy += hashSaltEntropyCollected; /* collect salt entropy */ } /* ZSTD_row_update_internal(): From 4f33201ab4ed606d7bbbb3ae2ac3b6d47b1dd36d Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Tue, 7 Mar 2023 12:11:09 -0800 Subject: [PATCH 08/13] Add init once memory - Adds memory type that is guaranteed to have been initialized at least once in the workspace's lifetime. - Changes tag space in row hash to be based on init once memory. - Moves buffers to aligned memory and removes the buffer memory type. --- lib/common/compiler.h | 4 + lib/compress/zstd_compress.c | 86 +++++++++--------- lib/compress/zstd_cwksp.h | 165 +++++++++++++++++++++++------------ lib/compress/zstd_ldm.c | 4 +- lib/compress/zstd_opt.c | 2 + 5 files changed, 158 insertions(+), 103 deletions(-) diff --git a/lib/common/compiler.h b/lib/common/compiler.h index d4f2f285d79..73f8d01998b 100644 --- a/lib/common/compiler.h +++ b/lib/common/compiler.h @@ -311,6 +311,10 @@ void __msan_poison(const volatile void *a, size_t size); /* Returns the offset of the first (at least partially) poisoned byte in the memory range, or -1 if the whole range is good. */ intptr_t __msan_test_shadow(const volatile void *x, size_t size); + +/* Print shadow and origin for the memory range to stderr in a human-readable + format. 
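[Editorial illustration] To make the salt-evolution scheme introduced in the previous patch easier to follow, here is a standalone, hedged sketch that mirrors ZSTD_bitmix() and ZSTD_advanceHashSalt() from the diff in a tiny driver. Only the structure and constants of ZSTD_bitmix() are taken from the patch; the surrounding names (demo_rotr64, demo_bitmix, the stand-in entropy value) are hypothetical and not part of zstd.

#include <stdint.h>
#include <stdio.h>

/* Rotate right; same form as ZSTD_rotateRight_U64() in lib/common/bits.h. */
static uint64_t demo_rotr64(uint64_t value, unsigned count) {
    return (value >> count) | (value << ((0U - count) & 0x3F));
}

/* Same structure and constants as ZSTD_bitmix() added by this patch
 * (XXH3_rrmxmx-style mixing). */
static uint64_t demo_bitmix(uint64_t val, uint64_t len) {
    val ^= demo_rotr64(val, 49) ^ demo_rotr64(val, 24);
    val *= 0x9FB21C651E98DF25ULL;
    val ^= (val >> 35) + len;
    val *= 0x9FB21C651E98DF25ULL;
    return val ^ (val >> 28);
}

int main(void) {
    uint64_t salt = 0;            /* plays the role of ms->hashSalt */
    uint64_t entropy = 0xABCDu;   /* stand-in for ms->hashSaltEntropy */
    int reset;
    for (reset = 1; reset <= 3; ++reset) {
        /* mirrors ZSTD_advanceHashSalt(): derive a new salt from the old one
         * plus whatever entropy the row-hash updates collected */
        salt = demo_bitmix(salt, 8) ^ demo_bitmix(entropy, 4);
        printf("salt after reset %d: %016llx\n", reset, (unsigned long long)salt);
        entropy += 12345;         /* pretend more entropy was collected */
    }
    return 0;
}

Each reset mixes the previous salt with entropy gathered during row-hash insertions, so a reused tag table sees a fresh salt without having to be re-zeroed.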
*/ +void __msan_print_shadow(const volatile void *x, size_t size); #endif #if ZSTD_ADDRESS_SANITIZER && !defined(ZSTD_ASAN_DONT_POISON_WORKSPACE) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 15a7a666db4..44acc496d0f 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1653,9 +1653,9 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); - size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + size_t const tokenSpace = ZSTD_cwksp_aligned_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + + 3 * ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); @@ -1666,28 +1666,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; - size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) - + ZSTD_cwksp_alloc_size(buffOutSize); + size_t const bufferSpace = ZSTD_cwksp_aligned_alloc_size(buffInSize) + + ZSTD_cwksp_aligned_alloc_size(buffOutSize); - size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const cctxSpace = isStatic ? ZSTD_cwksp_aligned_alloc_size(sizeof(ZSTD_CCtx)) : 0; size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); size_t const externalSeqSpace = useSequenceProducer ? 
ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) : 0; - size_t const neededSpace = - cctxSpace + - entropySpace + - blockStateSpace + - ldmSpace + - ldmSeqSpace + - matchStateSize + - tokenSpace + - bufferSpace + - externalSeqSpace; - - DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + size_t const objectsSpace = cctxSpace + entropySpace + blockStateSpace; + size_t const tableSpace = ldmSpace + matchStateSize; + size_t const alignedSpace = tokenSpace + bufferSpace + externalSeqSpace + ldmSeqSpace; + + size_t const neededSpace = objectsSpace + tableSpace + alignedSpace; return neededSpace; } @@ -1955,6 +1948,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp_clean_tables(ws); } + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + /* Row match finder needs an additional table of hashes ("tags") */ + size_t const tagTableSize = hSize; + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >= rowLog); + ms->rowHashLog = cParams->hashLog - rowLog; + } + } + /* opt parser space */ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { DEBUGLOG(4, "reserving optimal parser space"); @@ -1966,19 +1970,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } - if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { - { /* Row match finder needs an additional table of hashes ("tags") */ - size_t const tagTableSize = hSize; - ms->tagTable = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); - if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); - } - { /* Switch to 32-entry rows if searchLog is 5 (or more) */ - U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); - assert(cParams->hashLog >= rowLog); - ms->rowHashLog = cParams->hashLog - rowLog; - } - } - ms->cParams = *cParams; RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, @@ -2125,18 +2116,27 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + ¶ms->cParams, + params->useRowMatchFinder, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + /* ZSTD_wildcopy() is used to copy into the literals buffer, * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
*/ - zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.litStart = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, blockSize + WILDCOPY_OVERLENGTH); zc->seqStore.maxNbLit = blockSize; /* buffers */ zc->bufferedPolicy = zbuff; zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->inBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffInSize); zc->outBuffSize = buffOutSize; - zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + zc->outBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffOutSize); /* ldm bucketOffsets table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { @@ -2144,26 +2144,18 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, size_t const numBuckets = ((size_t)1) << (params->ldmParams.hashLog - params->ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + zc->ldmState.bucketOffsets = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, numBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } /* sequences storage */ ZSTD_referenceExternalSequences(zc, NULL, 0); zc->seqStore.maxNbSeq = maxNbSeq; - zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.llCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - FORWARD_IF_ERROR(ZSTD_reset_matchState( - &zc->blockState.matchState, - ws, - ¶ms->cParams, - params->useRowMatchFinder, - crp, - needsIndexReset, - ZSTD_resetTarget_CCtx), ""); /* ldm hash table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { @@ -5218,7 +5210,7 @@ size_t ZSTD_estimateCDictSize_advanced( + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); + : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) @@ -5303,7 +5295,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 - : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index 9113cdbd4fb..06cf2e77b38 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -16,6 +16,7 @@ ***************************************/ #include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ #include "../common/zstd_internal.h" +#include "../common/portability_macros.h" #if defined (__cplusplus) extern "C" { @@ -45,7 +46,7 @@ extern "C" { ***************************************/ typedef enum { ZSTD_cwksp_alloc_objects, - ZSTD_cwksp_alloc_buffers, + ZSTD_cwksp_alloc_aligned_init_once, ZSTD_cwksp_alloc_aligned } ZSTD_cwksp_alloc_phase_e; @@ -99,8 +100,8 @@ typedef enum { * * Workspace Layout: * - * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] + * [ ... workspace ... ] + * [objects][tables ... ->] free space [<- ... aligned][<- ... init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: @@ -124,13 +125,19 @@ typedef enum { * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). * Their sizes depend on the cparams. These tables are 64-byte aligned. * - * - Aligned: these buffers are used for various purposes that require 4 byte - * alignment, but don't require any initialization before they're used. These - * buffers are each aligned to 64 bytes. + * - Init once: these buffers require to be initialized at least once before + * use. They should be used when we want to skip memory initialization + * while not triggering memory checkers (like Valgrind) when reading from + * from this memory without writing to it first. + * These buffers should be used carefully as they might contain data + * from previous compressions. + * Buffers are aligned to 64 bytes. + * + * - Aligned: these buffers don't require any initialization before they're + * used. The user of the buffer should make sure they write into a buffer + * location before reading from it. + * Buffers are aligned to 64 bytes. * - * - Buffers: these buffers are used for various purposes that don't require - * any alignment or initialization before they're used. This means they can - * be moved around at no cost for a new compression. * * Allocating Memory: * @@ -138,8 +145,8 @@ typedef enum { * correctly packed into the workspace buffer. That order is: * * 1. Objects - * 2. Buffers - * 3. Aligned/Tables + * 2. Init once / Tables + * 3. Aligned / Tables * * Attempts to reserve objects of different types out of order will fail. 
*/ @@ -151,6 +158,7 @@ typedef struct { void* tableEnd; void* tableValidEnd; void* allocStart; + void* initOnceStart; BYTE allocFailed; int workspaceOversizedDuration; @@ -163,6 +171,7 @@ typedef struct { ***************************************/ MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { (void)ws; @@ -172,6 +181,21 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { assert(ws->tableEnd <= ws->allocStart); assert(ws->tableValidEnd <= ws->allocStart); assert(ws->allocStart <= ws->workspaceEnd); + assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); + assert(ws->workspace <= ws->initOnceStart); + assert((size_t)ws->allocStart % ZSTD_CWKSP_ALIGNMENT_BYTES == 0); +#if ZSTD_MEMORY_SANITIZER + { + intptr_t const offset = __msan_test_shadow(ws->initOnceStart, + (U8*)ZSTD_cwksp_initialAllocStart(ws) - (U8*)ws->initOnceStart); +#if defined(ZSTD_MSAN_PRINT) + if(offset!=-1) { + __msan_print_shadow((U8*)ws->initOnceStart + offset - 8, 32); + } +#endif + assert(offset==-1); + }; +#endif } /** @@ -218,12 +242,9 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { * for internal purposes (currently only alignment). */ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { - /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes - * to align the beginning of tables section, as well as another n_2=[0, 63] bytes - * to align the beginning of the aligned section. - * - * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and - * aligneds being sized in multiples of 64 bytes. + /* For alignment, the wksp will always allocate an additional ZSTD_CWKSP_ALIGNMENT_BYTES + * bytes to align the beginning of tables section, this will ensure that tables section + * and aligned buffers sections are aligned. */ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; return slackSpace; @@ -238,10 +259,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt size_t const alignBytesMask = alignBytes - 1; size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; assert((alignBytes & alignBytesMask) == 0); - assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); + assert(bytes < alignBytes); return bytes; } +/** + * Returns the initial value for allocStart which is used to determine the position from + * which we can allocate from the end of the workspace. + */ +MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { + return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); +} + /** * Internal function. Do not use directly. 
* Reserves the given number of bytes within the aligned/buffer segment of the wksp, @@ -282,23 +311,12 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase { assert(phase >= ws->phase); if (phase > ws->phase) { - /* Going from allocating objects to allocating buffers */ - if (ws->phase < ZSTD_cwksp_alloc_buffers && - phase >= ZSTD_cwksp_alloc_buffers) { + /* Going from allocating objects to allocating aligned / tables */ + if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && + phase >= ZSTD_cwksp_alloc_aligned_init_once) { ws->tableValidEnd = ws->objectEnd; - } + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); - /* Going from allocating buffers to allocating aligneds/tables */ - if (ws->phase < ZSTD_cwksp_alloc_aligned && - phase >= ZSTD_cwksp_alloc_aligned) { - { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ - size_t const bytesToAlign = - ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); - DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); - ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ - RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), - memory_allocation, "aligned phase - alignment initial allocation failed!"); - } { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ void* const alloc = ws->objectEnd; size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); @@ -310,7 +328,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; - } } } + } } } ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } @@ -322,7 +340,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase */ MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { - return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); } /** @@ -359,12 +377,35 @@ ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase return alloc; } + /** - * Reserves and returns unaligned memory. + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + * This memory has been initialized at least once in the past. + * This doesn't mean it has been initialized this time, and it might contain data from previous + * operations. + * The main usage is for algorithms that might need read access into uninitialized memory. + * The algorithm must maintain safety under these conditions and must make sure it doesn't + * leak any of the past data (directly or in side channels). */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) { - return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); + if(ptr && ptr < ws->initOnceStart) { + /* We assume the memory following the current allocation is either: + * 1. 
Not usable as initOnce memory (end of workspace) + * 2. Another initOnce buffer that has been allocated before (and so was previously memset) + * 3. An ASAN redzone, in which case we don't want to write on it + * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. + * Note that we assume here tha MSAN and ASAN cannot run in the same time. */ + ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); + ws->initOnceStart = ptr; + } +#if ZSTD_MEMORY_SANITIZER + assert(__msan_test_shadow(ptr, bytes) == -1); +#endif + return ptr; } /** @@ -385,13 +426,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) */ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { - const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; void* alloc; void* end; void* top; - if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { - return NULL; + /* We can only start allocating tables after we are done reserving space for objects at the + * start of the workspace */ + if(ws->phase < phase) { + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } } alloc = ws->tableEnd; end = (BYTE *)alloc + bytes; @@ -470,11 +515,19 @@ MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) /* To validate that the table re-use logic is sound, and that we don't * access table space that we haven't cleaned, we re-"poison" the table - * space every time we mark it dirty. */ + * space every time we mark it dirty. + * Since tableValidEnd space and initOnce space may overlap we don't poison + * the initOnce portion as it break its promise. This means that this poisoning + * check isn't always applied fully. */ { size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; assert(__msan_test_shadow(ws->objectEnd, size) == -1); - __msan_poison(ws->objectEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + __msan_poison(ws->objectEnd, size); + } else { + assert(ws->initOnceStart >= ws->objectEnd); + __msan_poison(ws->objectEnd, (BYTE*)ws->initOnceStart - (BYTE*)ws->objectEnd); + } } #endif @@ -539,11 +592,14 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { #if ZSTD_MEMORY_SANITIZER && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) /* To validate that the context re-use logic is sound, and that we don't * access stuff that this compression hasn't initialized, we re-"poison" - * the workspace (or at least the non-static, non-table parts of it) - * every time we start a new compression. */ + * the workspace except for the areas in which we expect memory re-use + * without initialization (objects, valid tables area and init once + * memory). 
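[Editorial illustration] The init-once reservation introduced above is easiest to see in isolation. Below is a minimal, self-contained sketch of the same idea: end-of-workspace memory that is zeroed only the first time a given depth is reached, and merely handed back on later compressions. It is not the zstd API; toy_wksp, toy_reserve_init_once and toy_clear are hypothetical names, and alignment, MSAN/ASAN handling and failure bookkeeping are omitted.

#include <stdio.h>
#include <string.h>

typedef struct {
    unsigned char* base;          /* start of the workspace                      */
    unsigned char* end;           /* one past the end of the workspace           */
    unsigned char* allocStart;    /* current low bound of end-allocated memory   */
    unsigned char* initOnceStart; /* lowest address ever zeroed in this workspace */
} toy_wksp;

static void toy_init(toy_wksp* ws, void* buffer, size_t size) {
    ws->base = (unsigned char*)buffer;
    ws->end = ws->base + size;
    ws->allocStart = ws->end;
    ws->initOnceStart = ws->end;  /* nothing has been initialized yet */
}

/* Reserve `bytes` from the end; zero only the part that has never been
 * initialized in this workspace's lifetime (the analogue of the
 * MIN(initOnceStart - ptr, alignedBytes) memset in the patch). */
static void* toy_reserve_init_once(toy_wksp* ws, size_t bytes) {
    unsigned char* ptr;
    if ((size_t)(ws->allocStart - ws->base) < bytes) return NULL;  /* out of space */
    ptr = ws->allocStart - bytes;
    ws->allocStart = ptr;
    if (ptr < ws->initOnceStart) {
        size_t toZero = (size_t)(ws->initOnceStart - ptr);
        if (toZero > bytes) toZero = bytes;   /* defensive clamp, mirrors MIN() */
        memset(ptr, 0, toZero);
        ws->initOnceStart = ptr;
    }
    return ptr;
}

/* Reset for a new compression: end allocations are dropped, but initOnceStart
 * is kept, so previously zeroed memory is not zeroed again. */
static void toy_clear(toy_wksp* ws) { ws->allocStart = ws->end; }

int main(void) {
    static unsigned char buffer[256];
    toy_wksp ws;
    toy_init(&ws, buffer, sizeof(buffer));
    (void)toy_reserve_init_once(&ws, 64);    /* zeroes 64 bytes */
    toy_clear(&ws);
    (void)toy_reserve_init_once(&ws, 128);   /* zeroes only the 64 new bytes */
    printf("init-once low-water mark: %u bytes from the end\n",
           (unsigned)(ws.end - ws.initOnceStart));
    return 0;
}

As with the real implementation, a caller may read stale data from an earlier compression out of such a buffer, so any algorithm using it must remain correct (and leak nothing) under that condition.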
*/ { - size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; - __msan_poison(ws->tableValidEnd, size); + if((BYTE*)ws->tableValidEnd < (BYTE*)ws->initOnceStart) { + size_t size = (BYTE*)ws->initOnceStart - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } } #endif @@ -559,10 +615,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { #endif ws->tableEnd = ws->objectEnd; - ws->allocStart = ws->workspaceEnd; + ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); ws->allocFailed = 0; - if (ws->phase > ZSTD_cwksp_alloc_buffers) { - ws->phase = ZSTD_cwksp_alloc_buffers; + if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { + ws->phase = ZSTD_cwksp_alloc_aligned_init_once; } ZSTD_cwksp_assert_internal_consistency(ws); } @@ -579,6 +635,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c ws->workspaceEnd = (BYTE*)start + size; ws->objectEnd = ws->workspace; ws->tableValidEnd = ws->objectEnd; + ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); ws->phase = ZSTD_cwksp_alloc_objects; ws->isStatic = isStatic; ZSTD_cwksp_clear(ws); @@ -616,7 +673,7 @@ MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) - + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); } MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { @@ -634,13 +691,13 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, size_t const estimatedSpace, int resizedWorkspace) { if (resizedWorkspace) { - /* Resized/newly allocated wksp should have exact bounds */ - return ZSTD_cwksp_used(ws) == estimatedSpace; + /* Resized/newly allocated wksp should have exact bounds up to one alignment */ + return (estimatedSpace - 64) <= ZSTD_cwksp_used(ws) && ZSTD_cwksp_used(ws) <= estimatedSpace; } else { /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes * than estimatedSpace. See the comments in zstd_cwksp.h for details. */ - return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); + return (ZSTD_cwksp_used(ws) >= estimatedSpace - 64) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); } } diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c index 3d74ff19e3c..aaa9473c3e6 100644 --- a/lib/compress/zstd_ldm.c +++ b/lib/compress/zstd_ldm.c @@ -157,8 +157,8 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmHSize = ((size_t)1) << params.hashLog; size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); - size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) - + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + size_t const totalSize = ZSTD_cwksp_aligned_alloc_size(ldmBucketSize) + + ZSTD_cwksp_aligned_alloc_size(ldmHSize * sizeof(ldmEntry_t)); return params.enableLdm == ZSTD_ps_enable ? 
totalSize : 0; } diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c index fdd7f9d8b5a..f02a760946e 100644 --- a/lib/compress/zstd_opt.c +++ b/lib/compress/zstd_opt.c @@ -1086,6 +1086,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, ZSTD_optimal_t lastSequence; ZSTD_optLdm_t optLdm; + ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); From 5e4f57e6dc5fe3f82ede255e6af2bb78850be5e9 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Tue, 7 Mar 2023 14:41:33 -0800 Subject: [PATCH 09/13] Restores unaligned buffer allocations, but changes allocation order to allocate them after aligned buffers --- lib/compress/zstd_compress.c | 94 +++++++++++++++++++----------------- lib/compress/zstd_cwksp.h | 44 +++++++++-------- lib/compress/zstd_ldm.c | 4 +- 3 files changed, 77 insertions(+), 65 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 44acc496d0f..49aa385466d 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1653,9 +1653,9 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); - size_t const tokenSpace = ZSTD_cwksp_aligned_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(BYTE)); + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); @@ -1666,21 +1666,28 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; - size_t const bufferSpace = ZSTD_cwksp_aligned_alloc_size(buffInSize) - + ZSTD_cwksp_aligned_alloc_size(buffOutSize); + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + + ZSTD_cwksp_alloc_size(buffOutSize); - size_t const cctxSpace = isStatic ? ZSTD_cwksp_aligned_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); size_t const externalSeqSpace = useSequenceProducer ? 
ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) : 0; - size_t const objectsSpace = cctxSpace + entropySpace + blockStateSpace; - size_t const tableSpace = ldmSpace + matchStateSize; - size_t const alignedSpace = tokenSpace + bufferSpace + externalSeqSpace + ldmSeqSpace; - - size_t const neededSpace = objectsSpace + tableSpace + alignedSpace; + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace + + externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); return neededSpace; } @@ -2125,38 +2132,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, needsIndexReset, ZSTD_resetTarget_CCtx), ""); - /* ZSTD_wildcopy() is used to copy into the literals buffer, - * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. - */ - zc->seqStore.litStart = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, blockSize + WILDCOPY_OVERLENGTH); - zc->seqStore.maxNbLit = blockSize; - - /* buffers */ - zc->bufferedPolicy = zbuff; - zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffInSize); - zc->outBuffSize = buffOutSize; - zc->outBuff = (char*)ZSTD_cwksp_reserve_aligned(ws, buffOutSize); - - /* ldm bucketOffsets table */ - if (params->ldmParams.enableLdm == ZSTD_ps_enable) { - /* TODO: avoid memset? */ - size_t const numBuckets = - ((size_t)1) << (params->ldmParams.hashLog - - params->ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, numBuckets); - ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); - } - - /* sequences storage */ - ZSTD_referenceExternalSequences(zc, NULL, 0); - zc->seqStore.maxNbSeq = maxNbSeq; - zc->seqStore.llCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = (BYTE*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); - /* ldm hash table */ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { /* TODO: avoid memset? */ @@ -2178,8 +2155,39 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); } + /* buffers */ + + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->outBuffSize = buffOutSize; + zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + + /* ldm bucketOffsets table */ + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* TODO: avoid memset? 
*/ + size_t const numBuckets = + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); + ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); + } + + /* sequences storage */ + ZSTD_referenceExternalSequences(zc, NULL, 0); + zc->seqStore.maxNbSeq = maxNbSeq; + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); - assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); zc->initialized = 1; @@ -5210,7 +5218,7 @@ size_t ZSTD_estimateCDictSize_advanced( + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams), /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); } size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) @@ -5295,7 +5303,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedicatedDictSearch, /* forCCtx */ 0) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 - : ZSTD_cwksp_aligned_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); void* const workspace = ZSTD_customMalloc(workspaceSize, customMem); ZSTD_cwksp ws; ZSTD_CDict* cdict; diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index 06cf2e77b38..a0759a72901 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -47,7 +47,8 @@ extern "C" { typedef enum { ZSTD_cwksp_alloc_objects, ZSTD_cwksp_alloc_aligned_init_once, - ZSTD_cwksp_alloc_aligned + ZSTD_cwksp_alloc_aligned, + ZSTD_cwksp_alloc_buffers } ZSTD_cwksp_alloc_phase_e; /** @@ -101,7 +102,7 @@ typedef enum { * Workspace Layout: * * [ ... workspace ... ] - * [objects][tables ... ->] free space [<- ... aligned][<- ... init once] + * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] * * The various objects that live in the workspace are divided into the * following categories, and are allocated separately: @@ -138,6 +139,9 @@ typedef enum { * location before reading from it. * Buffers are aligned to 64 bytes. * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can + * be moved around at no cost for a new compression. * * Allocating Memory: * @@ -147,6 +151,7 @@ typedef enum { * 1. Objects * 2. Init once / Tables * 3. Aligned / Tables + * 4. Buffers / Tables * * Attempts to reserve objects of different types out of order will fail. 
*/ @@ -183,7 +188,6 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { assert(ws->allocStart <= ws->workspaceEnd); assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); assert(ws->workspace <= ws->initOnceStart); - assert((size_t)ws->allocStart % ZSTD_CWKSP_ALIGNMENT_BYTES == 0); #if ZSTD_MEMORY_SANITIZER { intptr_t const offset = __msan_test_shadow(ws->initOnceStart, @@ -242,11 +246,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { * for internal purposes (currently only alignment). */ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { - /* For alignment, the wksp will always allocate an additional ZSTD_CWKSP_ALIGNMENT_BYTES - * bytes to align the beginning of tables section, this will ensure that tables section - * and aligned buffers sections are aligned. + /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES + * bytes to align the beginning of tables section and end of buffers; */ - size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; + size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; return slackSpace; } @@ -311,7 +314,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase { assert(phase >= ws->phase); if (phase > ws->phase) { - /* Going from allocating objects to allocating aligned / tables */ + /* Going from allocating objects to allocating initOnce / tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && phase >= ZSTD_cwksp_alloc_aligned_init_once) { ws->tableValidEnd = ws->objectEnd; @@ -377,6 +380,13 @@ ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase return alloc; } +/** + * Reserves and returns unaligned memory. + */ +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); +} /** * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). @@ -673,7 +683,7 @@ MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) - + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); + + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); } MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { @@ -688,17 +698,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { * Returns if the estimated space needed for a wksp is within an acceptable limit of the * actual amount of space used. */ -MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, - size_t const estimatedSpace, int resizedWorkspace) { - if (resizedWorkspace) { - /* Resized/newly allocated wksp should have exact bounds up to one alignment */ - return (estimatedSpace - 64) <= ZSTD_cwksp_used(ws) && ZSTD_cwksp_used(ws) <= estimatedSpace; - } else { - /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes - * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
- */ - return (ZSTD_cwksp_used(ws) >= estimatedSpace - 64) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); - } +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { + /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice + * the alignment bytes difference between estimation and actual usage */ + return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && + ZSTD_cwksp_used(ws) <= estimatedSpace; } diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c index aaa9473c3e6..3d74ff19e3c 100644 --- a/lib/compress/zstd_ldm.c +++ b/lib/compress/zstd_ldm.c @@ -157,8 +157,8 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmHSize = ((size_t)1) << params.hashLog; size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); - size_t const totalSize = ZSTD_cwksp_aligned_alloc_size(ldmBucketSize) - + ZSTD_cwksp_aligned_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); return params.enableLdm == ZSTD_ps_enable ? totalSize : 0; } From f4aab97e84ca02d2ae634fa7eb6b704196c44138 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Thu, 9 Mar 2023 09:02:53 -0800 Subject: [PATCH 10/13] CR fixes --- lib/compress/zstd_cwksp.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/compress/zstd_cwksp.h b/lib/compress/zstd_cwksp.h index a0759a72901..cc7fb1c715c 100644 --- a/lib/compress/zstd_cwksp.h +++ b/lib/compress/zstd_cwksp.h @@ -321,9 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ - void* const alloc = ws->objectEnd; + void *const alloc = ws->objectEnd; size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); - void* const objectEnd = (BYTE*)alloc + bytesToAlign; + void *const objectEnd = (BYTE *) alloc + bytesToAlign; DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, "table phase - alignment initial allocation failed!"); @@ -331,7 +331,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase ws->tableEnd = objectEnd; /* table area starts being empty */ if (ws->tableValidEnd < ws->tableEnd) { ws->tableValidEnd = ws->tableEnd; - } } } + } + } + } ws->phase = phase; ZSTD_cwksp_assert_internal_consistency(ws); } @@ -408,7 +410,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t byt * 2. Another initOnce buffer that has been allocated before (and so was previously memset) * 3. An ASAN redzone, in which case we don't want to write on it * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. - * Note that we assume here tha MSAN and ASAN cannot run in the same time. */ + * Note that we assume here that MSAN and ASAN cannot run in the same time. 
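[Editorial illustration] Since the estimation bound changes twice in this series, a hedged arithmetic sketch (not zstd code, made-up numbers) of what the reworked ZSTD_cwksp_estimated_space_within_bounds() tolerates may help: the estimate now budgets a fixed 2*64-byte slack for the two alignment points (pushing the tables section up to a 64-byte boundary, and rounding the end of the workspace down before end allocations start), so actual usage may undershoot the estimate by at most that slack while never exceeding it.

#include <assert.h>
#include <stdio.h>

int main(void) {
    size_t const alignment   = 64;
    size_t const slack       = 2 * alignment;  /* analogue of ZSTD_cwksp_slack_space_required() */
    size_t const objectBytes = 200;            /* front allocations, unaligned   */
    size_t const tableBytes  = 1024;           /* already a multiple of 64       */
    size_t const endBytes    = 512 + 300;      /* aligned + plain end allocations */

    /* Estimate: raw sizes plus the fixed slack. */
    size_t const estimate = objectBytes + tableBytes + endBytes + slack;

    /* One possible actual usage: the tables section had to be pushed up by
     * 56 bytes to reach 64-byte alignment, and the end of the workspace was
     * rounded down by 40 bytes before end allocations started; both paddings
     * count as "used" in ZSTD_cwksp_used(). */
    size_t const used = (objectBytes + 56 + tableBytes) + (40 + endBytes);

    assert(estimate - slack <= used && used <= estimate);
    printf("estimate=%u used=%u headroom=%u (slack budget %u)\n",
           (unsigned)estimate, (unsigned)used,
           (unsigned)(estimate - used), (unsigned)slack);
    return 0;
}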
*/ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ws->initOnceStart = ptr; } From a8c62ff9d79e8a1ffc2a236021129e39f1cb716a Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Mon, 13 Mar 2023 11:00:59 -0700 Subject: [PATCH 11/13] Merge bugfix --- lib/compress/zstd_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index d38db375ca6..77cb82e2178 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1978,7 +1978,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_advanceHashSalt(ms); } else { /* When we are not salting we want to always memset the memory */ - ms->tagTable = (U16 *) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); + ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ZSTD_memset(ms->tagTable, 0, tagTableSize); ms->hashSalt = 0; } From 2343273fc1a48e685f242b8855c846c49e672f29 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Mon, 13 Mar 2023 11:24:36 -0700 Subject: [PATCH 12/13] Use `MEM_STATIC` instead of `FORCE_INLINE_TEMPLATE` --- lib/common/bits.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/common/bits.h b/lib/common/bits.h index 378a34047a7..def56c474c3 100644 --- a/lib/common/bits.h +++ b/lib/common/bits.h @@ -176,21 +176,21 @@ MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCo * Rotates a bitfield to the right by "count" bits. * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts */ -FORCE_INLINE_TEMPLATE +MEM_STATIC U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { assert(count < 64); count &= 0x3F; /* for fickle pattern recognition */ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); } -FORCE_INLINE_TEMPLATE +MEM_STATIC U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { assert(count < 32); count &= 0x1F; /* for fickle pattern recognition */ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); } -FORCE_INLINE_TEMPLATE +MEM_STATIC U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { assert(count < 16); count &= 0x0F; /* for fickle pattern recognition */ From 12d9b53f3e9bc43c99cd6da00b89b9c2985cf334 Mon Sep 17 00:00:00 2001 From: Yonatan Komornik Date: Mon, 13 Mar 2023 13:30:04 -0700 Subject: [PATCH 13/13] merge fix --- lib/compress/zstd_compress.c | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 98a4e03d326..77cb82e2178 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -2209,37 +2209,6 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - /* buffers */ - - /* ZSTD_wildcopy() is used to copy into the literals buffer, - * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. - */ - zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); - zc->seqStore.maxNbLit = blockSize; - - zc->bufferedPolicy = zbuff; - zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); - zc->outBuffSize = buffOutSize; - zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); - - /* ldm bucketOffsets table */ - if (params->ldmParams.enableLdm == ZSTD_ps_enable) { - /* TODO: avoid memset? 
*/ - size_t const numBuckets = - ((size_t)1) << (params->ldmParams.hashLog - - params->ldmParams.bucketSizeLog); - zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, numBuckets); - ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); - } - - /* sequences storage */ - ZSTD_referenceExternalSequences(zc, NULL, 0); - zc->seqStore.maxNbSeq = maxNbSeq; - zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
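[Editorial illustration] One detail of the rotate helpers touched by the MEM_STATIC patch above is worth spelling out: the shift amount is written as ((0U - count) & 0x3F) rather than (64 - count) so that count == 0 never produces a shift by the full type width, which is undefined behaviour in C. The standalone checks below illustrate that property; they are not part of the patch, and rotr64 is just a local copy of the same expression.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t value, unsigned count) {
    assert(count < 64);
    count &= 0x3F;
    return (value >> count) | (uint64_t)(value << ((0U - count) & 0x3F));
}

int main(void) {
    uint64_t const x = 0x0123456789ABCDEFULL;
    assert(rotr64(x, 0) == x);                    /* zero rotation is well-defined and a no-op */
    assert(rotr64(rotr64(x, 20), 44) == x);       /* rotations compose modulo 64               */
    assert(rotr64(x, 4) == ((x >> 4) | (x << 60))); /* matches the naive form for count > 0    */
    printf("rotr64 sanity checks passed\n");
    return 0;
}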