From 2caa126ab1e35639cb90e989b6d3edb38269670c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 5 Apr 2026 08:43:37 -0700 Subject: [PATCH 1/9] JIT: STORE_LCL_FLD coalescing Refactor the lowering-time store coalescing machinery so adjacent constant GT_STORE_LCL_FLD writes can reuse the existing combine logic that was previously limited to STOREIND/STORE_BLK. Gate the new local-field path from LowerStoreLocCommon with JitEnableStoreLclFldCoalescing. Extend the struct-promotion regression coverage for the newly handled cases. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/lower.cpp | 621 +++++++++++------- src/coreclr/jit/lower.h | 57 +- .../JIT/Directed/StructPromote/SpAddr.cs | 55 +- 4 files changed, 496 insertions(+), 238 deletions(-) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 6890ee573919d7..5318304d9206de 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -91,6 +91,7 @@ CONFIG_INTEGER(JitHideAlignBehindJmp, "JitHideAlignBehindJmp", 1) // Track stores to locals done through return buffers. 
CONFIG_INTEGER(JitOptimizeStructHiddenBuffer, "JitOptimizeStructHiddenBuffer", 1) +RELEASE_CONFIG_INTEGER(JitEnableStoreLclFldCoalescing, "JitEnableStoreLclFldCoalescing", 1) CONFIG_INTEGER(JitUnrollLoopMaxIterationCount, "JitUnrollLoopMaxIterationCount", diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6ebc78c1433591..06e5b3fe8b4689 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5662,6 +5662,11 @@ GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore) TryRetypingFloatingPointStoreToIntegerStore(lclStore); + if (lclStore->OperIs(GT_STORE_LCL_FLD) && (JitConfig.JitEnableStoreLclFldCoalescing() != 0)) + { + LowerStoreCoalescing(lclStore); + } + GenTree* src = lclStore->gtGetOp1(); LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); const bool srcIsMultiReg = src->IsMultiRegNode(); @@ -10436,6 +10441,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing } data->targetType = ind->TypeGet(); + data->accessSize = ind->Size(); data->value = isStore ? ind->Data() : nullptr; if (ind->Addr()->OperIs(GT_LEA)) { @@ -10458,6 +10464,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = index == nullptr ? 
nullptr : index; data->scale = ind->Addr()->AsAddrMode()->GetScale(); data->offset = ind->Addr()->AsAddrMode()->Offset(); + if ((base != nullptr) && base->OperIs(GT_LCL_VAR)) + { + data->lclNum = base->AsLclVar()->GetLclNum(); + } } else if (isNodeInvariant(m_compiler, ind->Addr(), true)) { @@ -10466,6 +10476,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = nullptr; data->scale = 1; data->offset = 0; + if ((ind->Addr() != nullptr) && ind->Addr()->OperIs(GT_LCL_VAR)) + { + data->lclNum = ind->Addr()->AsLclVar()->GetLclNum(); + } } else { @@ -10483,125 +10497,264 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); + data->storeFlags = ind->gtFlags; return true; } //------------------------------------------------------------------------ -// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar -// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// GetStoreCoalescingData: get the data needed to perform store coalescing for either +// STOREIND/STORE_BLK or STORE_LCL_FLD. // -// * STOREIND int -// +--* LCL_VAR byref V00 -// \--* CNS_INT int 0x1 +// Arguments: +// store - the store node +// data - [OUT] coalescing data // -// * STOREIND int -// +--* LEA(b+4) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT int 0x2 +// Return Value: +// true if the data was successfully retrieved, false otherwise. 
// -// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const +{ + if (store->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return GetLoadStoreCoalescingData(store->AsIndir(), data); + } + + if (!store->OperIs(GT_STORE_LCL_FLD)) + { + return false; + } + + auto isNodeInvariant = [](Compiler* compiler, GenTree* node) { + if (node->OperIsConst()) + { + return true; + } + + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); + }; + + auto* lclStore = store->AsLclFld(); + if (!isNodeInvariant(m_compiler, lclStore->Data())) + { + return false; + } + + bool isClosedRange = false; + LIR::ReadOnlyRange range = BlockRange().GetTreeRange(store, &isClosedRange); + if (!isClosedRange) + { + return false; + } + + LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); + data->targetType = store->TypeGet(); + data->accessSize = lclStore->GetSize(); + data->value = lclStore->Data(); + data->offset = lclStore->GetLclOffs(); + data->lclNum = lclStore->GetLclNum(); + data->isLocalStoreNode = true; + data->isAddressExposed = varDsc->IsAddressExposed(); + data->storeFlags = store->gtFlags; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); + + return true; +} + +//------------------------------------------------------------------------ +// LowerCheckCoalescedStoreAtomicity: decide whether it's safe to coalesce two adjacent +// stores into a single wider store without violating atomicity guarantees. 
// -// * STOREIND long -// +--* LEA(b+0) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT long 0x200000001 +// Arguments: +// currStore - the current store node +// prevStore - the previous store node +// currData - coalescing data for the current store +// prevData - coalescing data for the previous store // -// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: -// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// Return Value: +// true if it is safe to coalesce the two stores, false otherwise. +// +bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + GenTree* prevStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData) +{ + // Now the hardest part: decide whether it's safe to use an unaligned write. + // + // IND is always fine (and all IND created here from such) + // IND is not required to be atomic per our Memory Model + bool allowsNonAtomic = currData.AllowsNonAtomic() && prevData.AllowsNonAtomic(); + int minOffset = min(prevData.offset, currData.offset); + int maxEndOffset = max(prevData.offset + static_cast(prevData.accessSize), + currData.offset + static_cast(currData.accessSize)); + unsigned combinedSize = static_cast(maxEndOffset - minOffset); + + if (!allowsNonAtomic && (currData.lclNum == m_compiler->info.compRetBuffArg)) + { + // RetBuf is a private stack memory, so we don't need to worry about atomicity. + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore() && !currData.isAddressExposed && !prevData.isAddressExposed) + { + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore()) + { + // During lowering we cannot rely on the final stack offset yet. Instead, use the stack-home + // alignment guarantees implied by local allocation: locals are naturally aligned up to pointer + // size, and address-exposed locals must be conservative beyond that. 
+ unsigned homeAlignment = min(m_compiler->lvaLclStackHomeSize(currData.lclNum), (unsigned)TARGET_POINTER_SIZE); + bool isCombinedStoreAtomic = (combinedSize <= homeAlignment) && ((minOffset % combinedSize) == 0); + +#ifdef TARGET_ARM64 + if (currData.accessSize == TARGET_POINTER_SIZE) + { + // Per Arm ARM, a 128-bit SIMD write that is 64-bit aligned is treated as a pair of + // single-copy atomic 64-bit writes. + isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); + } +#endif + + return isCombinedStoreAtomic; + } + + if (!allowsNonAtomic && (genTypeSize(currStore) > 1) && !varTypeIsSIMD(currStore)) + { + // Ignore indices for now, they can invalidate our alignment assumptions. + // Although, we can take scale into account. + if (currData.index != nullptr) + { + return false; + } + + // Base address being TYP_REF gives us a hint that data is pointer-aligned. + if (!currData.baseAddr->TypeIs(TYP_REF)) + { + return false; + } + + // Check whether the combined indir is still aligned. + bool isCombinedIndirAtomic = + (combinedSize <= TARGET_POINTER_SIZE) && ((minOffset % static_cast(combinedSize)) == 0); + + if (combinedSize == 2 * TARGET_POINTER_SIZE) + { +#ifdef TARGET_ARM64 + // Per Arm Architecture Reference Manual for A-profile architecture: + // + // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory + // are treated as a pair of single - copy atomic 64 - bit writes. + // + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) + // + // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise + // we're already doing a load that has no atomicity guarantees. 
+ isCombinedIndirAtomic = true; +#endif + } + else if (combinedSize > TARGET_POINTER_SIZE) + { + return false; + } + + if (!isCombinedIndirAtomic) + { + return false; + } + } + + return true; +} + +//------------------------------------------------------------------------ +// LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD. // // Arguments: -// ind - the current STOREIND node +// node - current store node // -void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +void Lowering::LowerStoreCoalescing(GenTree* node) { -// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from -// unaligned accesses making this optimization questionable. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { return; } - if (!ind->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (!node->OperIs(GT_STOREIND, GT_STORE_BLK, GT_STORE_LCL_FLD)) { - // Load coalescing is not yet supported return; } - // We're going to do it in a loop while we see suitable STOREINDs to coalesce. - // E.g.: we have the following LIR sequence: - // - // ...addr nodes... - // STOREIND(int) - // ...addr nodes... - // STOREIND(short) - // ...addr nodes... - // STOREIND(short) <-- we're here - // - // First we merge two 'short' stores, then we merge the result with the 'int' store - // to get a single store of 8 bytes. 
do { LoadStoreCoalescingData currData; LoadStoreCoalescingData prevData; - // Get coalescing data for the current STOREIND - if (!GetLoadStoreCoalescingData(ind, &currData)) + if (!GetStoreCoalescingData(node, &currData)) { return; } GenTree* prevTree = currData.rangeStart->gtPrev; - // Now we need to find the previous STOREIND, - // we can ignore any NOPs or IL_OFFSETs in-between while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET)) { prevTree = prevTree->gtPrev; } - // It's not a store - bail out. - if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (prevTree == nullptr) { return; } - // Get coalescing data for the previous STOREIND - GenTreeIndir* prevInd = prevTree->AsIndir(); - if (!GetLoadStoreCoalescingData(prevInd, &prevData)) + if (node->OperIs(GT_STORE_LCL_FLD)) + { + if (!prevTree->OperIs(GT_STORE_LCL_FLD)) + { + return; + } + } + else if (!prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) { return; } - if (!currData.IsAddressEqual(prevData, /*ignoreOffset*/ true)) + if (!GetStoreCoalescingData(prevTree, &prevData)) { - // Non-offset part of the address is not the same - bail out. return; } - // The same offset means that we're storing to the same location of the same width. - // Just remove the previous store then. 
- if (prevData.offset == currData.offset) + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); + bool const sameLocation = currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization); + + if (!sameLocation) { + return; + } + + if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) + { + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG)) + { + return; + } + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } - // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial. - // The algorithm does support TYP_REF (with null value), but it seems to be not worth - // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two - // items at once. Although, it may be profitable to do "stp q0, q0, [addr]". - if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind)) + if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; } - assert(ind->OperIs(GT_STOREIND)); - assert(prevInd->OperIs(GT_STOREIND)); assert(prevData.IsStore()); assert(currData.IsStore()); - // For now, only non-relocatable constants are supported for data. if (!prevData.value->OperIsConst() || !currData.value->OperIsConst()) { return; @@ -10617,161 +10770,132 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) return; } - // Otherwise, the difference between two offsets has to match the size of the type. - // We don't support overlapping stores. - if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType)) + if (node->OperIs(GT_STOREIND, GT_STORE_BLK) && !node->AsIndir()->Addr()->OperIs(GT_LEA)) { return; } - // For now, we require the current STOREIND to have LEA (previous store may not have it) - // So we can easily adjust the offset, consider making it more flexible in future. 
- if (!ind->Addr()->OperIs(GT_LEA)) - { - return; - } - - // Now the hardest part: decide whether it's safe to use an unaligned write. - // - // IND is always fine (and all IND created here from such) - // IND is not required to be atomic per our Memory Model - bool allowsNonAtomic = - ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0); + var_types oldType = node->TypeGet(); + var_types newType = TYP_UNDEF; + bool tryReusingPrevValue = false; + int const minOffset = min(prevData.offset, currData.offset); + int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast(currData.accessSize); + int const maxEndOffset = max(prevEndOffset, currEndOffset); + int const combinedSize = maxEndOffset - minOffset; + bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); + bool const currContainsPrev = (currData.offset <= prevData.offset) && (currEndOffset >= prevEndOffset); + bool const storesAreAdjacent = (prevEndOffset == currData.offset) || (currEndOffset == prevData.offset); + bool const storesHaveSameSize = prevData.accessSize == currData.accessSize; - if (!allowsNonAtomic && currData.baseAddr->OperIs(GT_LCL_VAR) && - (currData.baseAddr->AsLclVar()->GetLclNum() == m_compiler->info.compRetBuffArg)) + if (allowOverlapOptimization && currContainsPrev) { - // RetBuf is a private stack memory, so we don't need to worry about atomicity. - allowsNonAtomic = true; + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + continue; } - if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind)) + if (allowOverlapOptimization && prevContainsCurr) { - // TODO-CQ: if we see that the target is a local memory (non address exposed) - // we can use any type (including SIMD) for a new load. - - // Ignore indices for now, they can invalidate our alignment assumptions. 
- // Although, we can take scale into account. - if (currData.index != nullptr) + if (!varTypeIsIntegral(node)) { return; } - // Base address being TYP_REF gives us a hint that data is pointer-aligned. - if (!currData.baseAddr->TypeIs(TYP_REF)) + switch (combinedSize) { - return; + case 1: + newType = TYP_UBYTE; + break; + case 2: + newType = TYP_USHORT; + break; + case 4: + newType = TYP_INT; + break; +#ifdef TARGET_64BIT + case 8: + newType = TYP_LONG; + break; +#endif + default: + return; } - - // Check whether the combined indir is still aligned. - bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) && - (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0; - - if (genTypeSize(ind) == TARGET_POINTER_SIZE) + } + else + { + if (!storesAreAdjacent || !storesHaveSameSize) { -#ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) - // - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. - isCombinedIndirAtomic = true; -#endif + return; } - if (!isCombinedIndirAtomic) + if (abs(prevData.offset - currData.offset) != static_cast(prevData.accessSize)) { return; } - } - // Since we're merging two stores of the same type, the new type is twice wider. 
- var_types oldType = ind->TypeGet(); - var_types newType = TYP_UNDEF; - bool tryReusingPrevValue = false; - switch (oldType) - { - case TYP_BYTE: - case TYP_UBYTE: - newType = TYP_USHORT; - break; + switch (oldType) + { + case TYP_BYTE: + case TYP_UBYTE: + newType = TYP_USHORT; + break; - case TYP_SHORT: - case TYP_USHORT: - newType = TYP_INT; - break; + case TYP_SHORT: + case TYP_USHORT: + newType = TYP_INT; + break; #ifdef TARGET_64BIT - case TYP_INT: - newType = TYP_LONG; - break; + case TYP_INT: + newType = TYP_LONG; + break; #if defined(FEATURE_HW_INTRINSICS) - case TYP_LONG: - case TYP_REF: - // TLDR: we should be here only if one of the conditions is true: - // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag - // 2) ARM64: Data is at least 8-byte aligned - // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+ - // - newType = TYP_SIMD16; - if ((oldType == TYP_REF) && - (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) - { - // For TYP_REF we only support null values. 
In theory, we can also support frozen handles, e.g.: - // - // arr[1] = "hello"; - // arr[0] = "world"; - // - // but we don't want to load managed references into SIMD registers (we can only do so - // when we can issue a nongc region for a block) - return; - } - break; + case TYP_LONG: + case TYP_REF: + newType = TYP_SIMD16; + if ((oldType == TYP_REF) && + (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) + { + return; + } + break; #if defined(TARGET_AMD64) - case TYP_SIMD16: - if (m_compiler->getPreferredVectorByteLength() >= 32) - { - newType = TYP_SIMD32; + case TYP_SIMD16: + if (m_compiler->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; - case TYP_SIMD32: - if (m_compiler->getPreferredVectorByteLength() >= 64) - { - newType = TYP_SIMD64; + case TYP_SIMD32: + if (m_compiler->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; -#elif defined(TARGET_ARM64) // TARGET_AMD64 - case TYP_SIMD16: - tryReusingPrevValue = true; - break; - -#endif // TARGET_ARM64 -#endif // FEATURE_HW_INTRINSICS -#endif // TARGET_64BIT +#elif defined(TARGET_ARM64) + case TYP_SIMD16: + tryReusingPrevValue = true; + break; +#endif +#endif +#endif + default: + return; + } + } - // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to - // be converted to TYP_INT/TYP_LONG for constant value. - // - // TYP_UINT and TYP_ULONG are not legal for GT_IND. - // - default: - return; + if (!LowerCheckCoalescedStoreAtomicity(node, prevTree, currData, prevData)) + { + return; } - // If we can't merge these two stores into a single store, we can at least - // cache prevData.value to a local and reuse it in currData. - // Normally, LSRA is expected to do this for us, but it's not always the case for SIMD. 
if (tryReusingPrevValue) { #if defined(FEATURE_HW_INTRINSICS) @@ -10783,43 +10907,56 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) m_compiler->gtNewLclvNode(use.ReplaceWithLclVar(m_compiler), prevData.value->TypeGet()); BlockRange().InsertBefore(currData.value, prevValueTmp); BlockRange().Remove(currData.value); - ind->Data() = prevValueTmp; + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = prevValueTmp; + } + else + { + node->AsIndir()->Data() = prevValueTmp; + } } -#endif // FEATURE_HW_INTRINSICS +#endif return; } assert(newType != TYP_UNDEF); - // We should not be here for stores requiring write barriers. - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind->AsStoreInd())); - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd->AsStoreInd())); + if (node->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + auto* ind = node->AsStoreInd(); + auto* prevInd = prevTree->AsStoreInd(); - // Delete previous STOREIND entirely - BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd)); - // It's not expected to be contained yet, but just in case... 
- ind->Data()->ClearContained(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // We know it's always LEA for now - GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + ind->Data()->ClearContained(); + GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + addr->SetOffset(min(prevData.offset, currData.offset)); - // Update offset to be the minimum of the two - addr->SetOffset(min(prevData.offset, currData.offset)); + ind->gtType = newType; + ind->Data()->gtType = newType; + } + else + { + auto* lclStore = node->AsLclFld(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // Update type for both STOREIND and val - ind->gtType = newType; - ind->Data()->gtType = newType; + lclStore->Data()->ClearContained(); + lclStore->gtType = newType; + lclStore->Data()->gtType = newType; + lclStore->SetLclOffs(min(prevData.offset, currData.offset)); + } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) - // Upgrading two SIMD stores to a wider SIMD store. - // Only on x64 since ARM64 has no options above SIMD16 if (varTypeIsSIMD(oldType)) { int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; - // if the previous store was at a higher address, swap the constants if (prevData.offset > currData.offset) { std::swap(lowerCns, upperCns); @@ -10830,56 +10967,94 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) memcpy(newCns.i8, lowerCns, oldWidth); memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); - ind->Data()->AsVecCon()->gtSimdVal = newCns; + currData.value->AsVecCon()->gtSimdVal = newCns; continue; } #endif - size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue(); - size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue(); - - // if the previous store was at a higher address, swap the constants - if (prevData.offset > currData.offset) - { - std::swap(lowerCns, upperCns); - } + size_t lowerCns = 
static_cast(prevData.value->AsIntCon()->IconValue()); + size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) - // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16 - // All legality checks were done above. if (varTypeIsSIMD(newType)) { - // Replace two 64bit constants with a single 128bit constant + if (prevData.offset > currData.offset) + { + std::swap(lowerCns, upperCns); + } + int8_t val[16]; memcpy(val, &lowerCns, 8); memcpy(val + 8, &upperCns, 8); GenTreeVecCon* vecCns = m_compiler->gtNewVconNode(newType, &val); - BlockRange().InsertAfter(ind->Data(), vecCns); - BlockRange().Remove(ind->Data()); - ind->gtOp2 = vecCns; + BlockRange().InsertAfter(currData.value, vecCns); + BlockRange().Remove(currData.value); + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = vecCns; + } + else + { + node->AsIndir()->Data() = vecCns; + } + continue; } -#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS +#endif - // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT - // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF. 
- size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_PER_BYTE; - lowerCns &= mask; - upperCns &= mask; + size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; + size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; + size_t newMask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(newType)) * BITS_PER_BYTE; + lowerCns &= prevMask; + upperCns &= currMask; - size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_PER_BYTE))); + size_t val = 0; + val |= (lowerCns << ((prevData.offset - minOffset) * BITS_PER_BYTE)); + val |= (upperCns << ((currData.offset - minOffset) * BITS_PER_BYTE)); + val &= newMask; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); - ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val; - if (genTypeSize(oldType) == 1) + auto* intCon = currData.value->AsIntCon(); + intCon->gtIconVal = static_cast(val); + if (genTypeSize(oldType) == 1 && node->OperIsIndir()) { - // A mark for future foldings that this IND doesn't need to be atomic. - ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; + node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } - } while (true); -#endif // TARGET_XARCH || TARGET_ARM64 +#endif +} + +//------------------------------------------------------------------------ +// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar +// IND/STOREIND node, try to merge them into a single store of a twice wider type. 
Example: +// +// * STOREIND int +// +--* LCL_VAR byref V00 +// \--* CNS_INT int 0x1 +// +// * STOREIND int +// +--* LEA(b+4) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT int 0x2 +// +// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +// +// * STOREIND long +// +--* LEA(b+0) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT long 0x200000001 +// +// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: +// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// +// Arguments: +// ind - the current STOREIND node +// +void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +{ + LowerStoreCoalescing(ind); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 763f68ff607771..47f2c09f730fdd 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -350,32 +350,62 @@ class Lowering final : public Phase struct LoadStoreCoalescingData { - var_types targetType; - GenTree* baseAddr; - GenTree* index; - GenTree* value; - uint32_t scale; - int offset; - GenTree* rangeStart; - GenTree* rangeEnd; + var_types targetType = TYP_UNDEF; + GenTree* baseAddr = nullptr; + GenTree* index = nullptr; + GenTree* value = nullptr; + uint32_t scale = 1; + int offset = 0; + unsigned accessSize = 0; + unsigned lclNum = BAD_VAR_NUM; + GenTreeFlags storeFlags = GTF_EMPTY; + bool isLocalStoreNode = false; + bool isAddressExposed = false; + GenTree* rangeStart = nullptr; + GenTree* rangeEnd = nullptr; bool IsStore() const { return value != nullptr; } - bool IsAddressEqual(const LoadStoreCoalescingData& other, bool ignoreOffset) const + bool IsLocalStore() const { - if ((scale != other.scale) || (targetType != other.targetType) || - !GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + return isLocalStoreNode; + } + + bool AllowsNonAtomic() 
const + { + return (storeFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0; + } + + bool IsAddressEqual(const LoadStoreCoalescingData& other, + bool ignoreOffset, + bool allowDifferentTypes = false) const + { + if ((scale != other.scale) || (!allowDifferentTypes && (targetType != other.targetType))) { return false; } + + if (IsLocalStore() || other.IsLocalStore()) + { + if (!IsLocalStore() || !other.IsLocalStore() || (lclNum != other.lclNum)) + { + return false; + } + } + else if (!GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + { + return false; + } + return ignoreOffset || (offset == other.offset); } }; bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const; + bool GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const; // Per tree node member functions GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind); @@ -388,7 +418,12 @@ class Lowering final : public Phase void MarkTree(GenTree* root); void UnmarkTree(GenTree* root); GenTree* LowerStoreIndir(GenTreeStoreInd* node); + void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); + bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + GenTree* prevStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index b9fa68710da733..9ae7a9c83543ff 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -2,20 +2,24 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// -using System.Runtime.CompilerServices; using System; +using System.Runtime.CompilerServices; using Xunit; public class SpAddr { - - // Struct in reg (2 ints) struct S { public int i0; public int i1; } + struct Pair + { + public int A; + public int B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -35,12 +39,55 @@ static int M(int i0, int i1, int i2, int i3) return Foo(s0, s1); // r0 <= r1; r1 <= r0; r2 <= r3; r3 <= r2 } + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Expose(ref Pair p) + { + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long NonAddressExposed(int a, int b) + { + Pair p; + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AddressExposed(int a, int b) + { + Pair p = default; + Expose(ref p); + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static int OverlappingZeroThenShort() + { + Pair p = default; + p.A = 0; + Unsafe.As(ref p) = 256; + return p.A; + } + [Fact] public static int TestEntryPoint() { int res = M(1, 2, 3, 4); Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); - if (res == 10) + long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); + long addressExposed = AddressExposed(0x10203040, 0x50607080); + int overlapping = OverlappingZeroThenShort(); + + if ((res == 10) && + (nonAddressExposed == 0x5566778811223344L) && + (addressExposed == 0x5060708010203040L) && + (overlapping == 256)) return 100; else return 99; From 8d03f6e7395093a93ec38e941f5aa623fa57adae Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 7 Apr 2026 13:39:39 -0700 Subject: [PATCH 2/9] review feedback --- src/coreclr/jit/lower.cpp | 131 +++++++++++++++++++++++--------------- src/coreclr/jit/lower.h | 1 - 2 files changed, 81 insertions(+), 51 deletions(-) diff --git 
a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 06e5b3fe8b4689..f71394ac4d63c9 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10391,6 +10391,34 @@ void Lowering::LowerBlockStoreAsHelperCall(GenTreeBlk* blkNode) } } +//------------------------------------------------------------------------ +// IsStoreCoalescingInvariantNode: check whether a node is simple enough to participate in +// lowering-time store coalescing. +// +// Arguments: +// compiler - the compiler instance +// node - the node to examine +// allowNull - true if a null node should be considered valid +// +// Return Value: +// true if the node is an allowed invariant input for store coalescing; otherwise false. +// +static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bool allowNull = false) +{ + if (node == nullptr) + { + return allowNull; + } + + if (node->OperIsConst()) + { + return true; + } + + // We can allow bigger trees here, but it's not clear if it's worth it. + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node. @@ -10414,23 +10442,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing const bool isStore = ind->OperIs(GT_STOREIND, GT_STORE_BLK); const bool isLoad = ind->OperIs(GT_IND); - auto isNodeInvariant = [](Compiler* m_compiler, GenTree* node, bool allowNull) { - if (node == nullptr) - { - return allowNull; - } - if (node->OperIsConst()) - { - return true; - } - // We can allow bigger trees here, but it's not clear if it's worth it. 
- return node->OperIs(GT_LCL_VAR) && !m_compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - if (isStore) { // For stores, Data() is expected to be an invariant node - if (!isNodeInvariant(m_compiler, ind->Data(), false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, ind->Data())) { return false; } @@ -10447,14 +10462,14 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing { GenTree* base = ind->Addr()->AsAddrMode()->Base(); GenTree* index = ind->Addr()->AsAddrMode()->Index(); - if (!isNodeInvariant(m_compiler, base, false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, base)) { // Base must be a local. It's possible for it to be nullptr when index is not null, // but let's ignore such cases. return false; } - if (!isNodeInvariant(m_compiler, index, true)) + if (!IsStoreCoalescingInvariantNode(m_compiler, index, true)) { // Index should be either nullptr or a local. return false; @@ -10469,7 +10484,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->lclNum = base->AsLclVar()->GetLclNum(); } } - else if (isNodeInvariant(m_compiler, ind->Addr(), true)) + else if (IsStoreCoalescingInvariantNode(m_compiler, ind->Addr(), true)) { // Address is just a local, no offset, scale is 1 data->baseAddr = ind->Addr(); @@ -10525,17 +10540,8 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d return false; } - auto isNodeInvariant = [](Compiler* compiler, GenTree* node) { - if (node->OperIsConst()) - { - return true; - } - - return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - auto* lclStore = store->AsLclFld(); - if (!isNodeInvariant(m_compiler, lclStore->Data())) + if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) { return false; } @@ -10568,7 +10574,6 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d // // Arguments: // currStore - the current store node 
-// prevStore - the previous store node // currData - coalescing data for the current store // prevData - coalescing data for the previous store // @@ -10576,7 +10581,6 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d // true if it is safe to coalesce the two stores, false otherwise. // bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, - GenTree* prevStore, const LoadStoreCoalescingData& currData, const LoadStoreCoalescingData& prevData) { @@ -10612,8 +10616,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* #ifdef TARGET_ARM64 if (currData.accessSize == TARGET_POINTER_SIZE) { - // Per Arm ARM, a 128-bit SIMD write that is 64-bit aligned is treated as a pair of - // single-copy atomic 64-bit writes. + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // a 128-bit SIMD write that is 64-bit aligned is treated as a pair of single-copy atomic 64-bit writes. isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); } #endif @@ -10643,15 +10647,13 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* if (combinedSize == 2 * TARGET_POINTER_SIZE) { #ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory are + // treated as a pair of single-copy atomic 64-bit writes. 
// - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null), and assume TYP_LONG/TYP_REF + // are already 64-bit aligned. Otherwise the existing load/store sequence would not have atomicity + // guarantees either. isCombinedIndirAtomic = true; #endif } @@ -10678,6 +10680,9 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* // void Lowering::LowerStoreCoalescing(GenTree* node) { + // LA, RISC-V, ARM32, and WASM are more likely to receive a large performance hit from unaligned accesses, + // making this optimization questionable there. + // TODO-WASM-CQ: re-evaluate enabling this once we have better data for Wasm unaligned accesses. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { @@ -10737,16 +10742,23 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) { - if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG) || - m_compiler->gtTreeHasSideEffects(currData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG)) + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_GLOB_EFFECT) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_GLOB_EFFECT)) { return; } + JITDUMP( + "Store coalescing: removing previous store [%06u] because store [%06u] rewrites the same location\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } + // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial. + // The algorithm does support TYP_REF (with null value), but it seems to be not worth + // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two + // items at once. 
Although, it may be profitable to do "stp q0, q0, [addr]". if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; @@ -10790,6 +10802,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if (allowOverlapOptimization && currContainsPrev) { + JITDUMP("Store coalescing: removing previous store [%06u] because store [%06u] fully overwrites it\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } @@ -10853,16 +10867,29 @@ void Lowering::LowerStoreCoalescing(GenTree* node) #if defined(FEATURE_HW_INTRINSICS) case TYP_LONG: case TYP_REF: + // TLDR: we should be here only if one of the conditions is true: + // 1) Both stores have the GTF_IND_ALLOW_NON_ATOMIC flag + // 2) ARM64: data is at least 8-byte aligned + // 3) AMD64: data is at least 16-byte aligned on AMD/Intel with AVX+ newType = TYP_SIMD16; if ((oldType == TYP_REF) && (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) { + // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: + // + // arr[1] = "hello"; + // arr[0] = "world"; + // + // but we don't want to load managed references into SIMD registers (we can only do so + // when we can issue a nongc region for a block). return; } break; #if defined(TARGET_AMD64) case TYP_SIMD16: + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16. 
if (m_compiler->getPreferredVectorByteLength() >= 32) { newType = TYP_SIMD32; @@ -10879,19 +10906,19 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } tryReusingPrevValue = true; break; -#elif defined(TARGET_ARM64) +#elif defined(TARGET_ARM64) // TARGET_AMD64 case TYP_SIMD16: tryReusingPrevValue = true; break; -#endif -#endif -#endif +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT default: return; } } - if (!LowerCheckCoalescedStoreAtomicity(node, prevTree, currData, prevData)) + if (!LowerCheckCoalescedStoreAtomicity(node, currData, prevData)) { return; } @@ -10917,7 +10944,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) node->AsIndir()->Data() = prevValueTmp; } } -#endif +#endif // FEATURE_HW_INTRINSICS return; } @@ -10952,6 +10979,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16. if (varTypeIsSIMD(oldType)) { int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; @@ -10970,8 +10999,10 @@ void Lowering::LowerStoreCoalescing(GenTree* node) currData.value->AsVecCon()->gtSimdVal = newCns; continue; } -#endif +#endif // TARGET_AMD64 && FEATURE_HW_INTRINSICS + // The integer path below places each constant according to its byte offset, so it doesn't need to swap the + // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. 
size_t lowerCns = static_cast(prevData.value->AsIntCon()->IconValue()); size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); @@ -11002,7 +11033,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) continue; } -#endif +#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; @@ -11023,7 +11054,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } } while (true); -#endif +#endif // TARGET_XARCH || TARGET_ARM64 } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 47f2c09f730fdd..698396ecbc406b 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -421,7 +421,6 @@ class Lowering final : public Phase void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, - GenTree* prevStore, const LoadStoreCoalescingData& currData, const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); From 578f52ba2407b0bdc9ec26d8b7e2cbf802262d8b Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 9 Apr 2026 14:50:32 -0700 Subject: [PATCH 3/9] fix issue with overlapping stores --- src/coreclr/jit/lower.cpp | 13 +++++++++---- src/tests/JIT/Directed/StructPromote/SpAddr.cs | 10 +++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index f71394ac4d63c9..701c376be4d63f 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11041,10 +11041,15 @@ void Lowering::LowerStoreCoalescing(GenTree* node) lowerCns &= prevMask; upperCns &= currMask; - size_t val = 0; - val |= (lowerCns << ((prevData.offset - minOffset) * BITS_PER_BYTE)); - val |= (upperCns << 
((currData.offset - minOffset) * BITS_PER_BYTE)); - val &= newMask; + unsigned const prevShift = static_cast((prevData.offset - minOffset) * BITS_PER_BYTE); + unsigned const currShift = static_cast((currData.offset - minOffset) * BITS_PER_BYTE); + + size_t prevBits = (lowerCns << prevShift) & newMask; + size_t currBits = (upperCns << currShift) & newMask; + + // Later stores must overwrite any overlapping bytes from earlier stores. + size_t currBitsMask = (currMask << currShift) & newMask; + size_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); auto* intCon = currData.value->AsIntCon(); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index 9ae7a9c83543ff..e0ec39b84927ac 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -67,11 +67,11 @@ static long AddressExposed(int a, int b) } [MethodImpl(MethodImplOptions.NoInlining)] - static int OverlappingZeroThenShort() + static int OverlappingNonZeroThenShort() { Pair p = default; - p.A = 0; - Unsafe.As(ref p) = 256; + p.A = unchecked((int)0xDEADBEEF); + Unsafe.As(ref p) = 0; return p.A; } @@ -82,12 +82,12 @@ public static int TestEntryPoint() Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); long addressExposed = AddressExposed(0x10203040, 0x50607080); - int overlapping = OverlappingZeroThenShort(); + int overlapping = OverlappingNonZeroThenShort(); if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == 256)) + (overlapping == unchecked((int)0xDEAD0000))) return 100; else return 99; From d9a6a7d773eb0101d6d3f2f8cb505f09fab702af Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 10 Apr 2026 07:24:29 -0700 Subject: [PATCH 4/9] fix mixed int/float cases --- src/coreclr/jit/lower.cpp 
| 67 +++++++++++++++++-- .../JIT/Directed/StructPromote/SpAddr.cs | 23 ++++++- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 701c376be4d63f..9b62b2b18a1d96 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10419,6 +10419,51 @@ static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bo return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); } +//------------------------------------------------------------------------ +// TryGetStoreCoalescingConstantBits: get the raw bits for a constant used by store +// coalescing. +// +// Arguments: +// value - the constant node +// bits - [out] the raw constant bits +// +// Return Value: +// true if the constant can participate in store coalescing; otherwise false. +// +static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) +{ + assert(bits != nullptr); + *bits = 0; + + if (value->IsCnsIntOrI()) + { + *bits = static_cast(value->AsIntCon()->IconValue()); + return true; + } + + if (value->IsCnsFltOrDbl()) + { + if (value->TypeIs(TYP_FLOAT)) + { + float floatCns = static_cast(value->AsDblCon()->DconValue()); + memcpy(bits, &floatCns, sizeof(floatCns)); + return true; + } + +#ifdef TARGET_64BIT + // This helper returns bits through size_t, so only 64-bit targets can carry the full payload of a double. + // That also matches the current coalescing surface: double-to-integer store retyping only matters when the + // resulting 8-byte integer store is supported. 
+ assert(value->TypeIs(TYP_DOUBLE)); + double doubleCns = value->AsDblCon()->DconValue(); + memcpy(bits, &doubleCns, sizeof(doubleCns)); + return true; +#endif + } + + return false; +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node. @@ -10733,7 +10778,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); - bool const sameLocation = currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization); + int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast(currData.accessSize); + bool const storesOverlap = (currData.offset < prevEndOffset) && (prevData.offset < currEndOffset); + bool const sameLocation = + currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization && storesOverlap); if (!sameLocation) { @@ -10791,8 +10840,6 @@ void Lowering::LowerStoreCoalescing(GenTree* node) var_types newType = TYP_UNDEF; bool tryReusingPrevValue = false; int const minOffset = min(prevData.offset, currData.offset); - int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); - int const currEndOffset = currData.offset + static_cast(currData.accessSize); int const maxEndOffset = max(prevEndOffset, currEndOffset); int const combinedSize = maxEndOffset - minOffset; bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); @@ -10983,6 +11030,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // Only on x64 since ARM64 has no options above SIMD16. 
if (varTypeIsSIMD(oldType)) { + if (!prevData.value->OperIs(GT_CNS_VEC) || !currData.value->OperIs(GT_CNS_VEC)) + { + return; + } + int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; @@ -11003,8 +11055,13 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // The integer path below places each constant according to its byte offset, so it doesn't need to swap the // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. - size_t lowerCns = static_cast(prevData.value->AsIntCon()->IconValue()); - size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); + size_t lowerCns = 0; + size_t upperCns = 0; + if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || + !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) + { + return; + } #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) if (varTypeIsSIMD(newType)) diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index e0ec39b84927ac..ab5f694192e12e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -20,6 +20,12 @@ struct Pair public int B; } + struct FloatPair + { + public float A; + public float B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -42,6 +48,10 @@ static int M(int i0, int i1, int i2, int i3) [MethodImpl(MethodImplOptions.NoInlining)] static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(FloatPair p) => + ((long)(uint)BitConverter.SingleToInt32Bits(p.B) << 32) | (uint)BitConverter.SingleToInt32Bits(p.A); + [MethodImpl(MethodImplOptions.NoInlining)] static void Expose(ref Pair p) { @@ -75,6 +85,15 @@ static int OverlappingNonZeroThenShort() return p.A; } + [MethodImpl(MethodImplOptions.NoInlining)] + static long 
AdjacentFloatNonZeroThenZero() + { + FloatPair p = default; + p.A = 1.0f; + p.B = 0.0f; + return Consume(p); + } + [Fact] public static int TestEntryPoint() { @@ -83,12 +102,13 @@ public static int TestEntryPoint() Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); long addressExposed = AddressExposed(0x10203040, 0x50607080); int overlapping = OverlappingNonZeroThenShort(); + long floatPair = AdjacentFloatNonZeroThenZero(); if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == unchecked((int)0xDEAD0000))) + (overlapping == unchecked((int)0xDEAD0000)) && + (floatPair == 0x000000003F800000L)) return 100; else return 99; From c61e8da06d624753e76d6bf193bd81dd04aa2b96 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 13 Apr 2026 09:49:37 -0700 Subject: [PATCH 5/9] review feedback: use BitOperations and uint64_t to build coalesced constant --- src/coreclr/jit/lower.cpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 9b62b2b18a1d96..dc299d24a78b7f 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10430,14 +10430,14 @@ static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bo // Return Value: // true if the constant can participate in store coalescing; otherwise false.
// -static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) +static bool TryGetStoreCoalescingConstantBits(GenTree* value, uint64_t* bits) { assert(bits != nullptr); *bits = 0; if (value->IsCnsIntOrI()) { - *bits = static_cast(value->AsIntCon()->IconValue()); + *bits = static_cast(value->AsIntCon()->IconValue()); return true; } @@ -10446,17 +10446,15 @@ static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) if (value->TypeIs(TYP_FLOAT)) { float floatCns = static_cast(value->AsDblCon()->DconValue()); - memcpy(bits, &floatCns, sizeof(floatCns)); + *bits = BitOperations::SingleToUInt32Bits(floatCns); return true; } #ifdef TARGET_64BIT - // This helper returns bits through size_t, so only 64-bit targets can carry the full payload of a double. - // That also matches the current coalescing surface: double-to-integer store retyping only matters when the - // resulting 8-byte integer store is supported. + // We only need the raw 64-bit payload for targets where the resulting 8-byte coalesced store is supported. assert(value->TypeIs(TYP_DOUBLE)); double doubleCns = value->AsDblCon()->DconValue(); - memcpy(bits, &doubleCns, sizeof(doubleCns)); + *bits = BitOperations::DoubleToUInt64Bits(doubleCns); return true; #endif } @@ -10777,6 +10775,12 @@ void Lowering::LowerStoreCoalescing(GenTree* node) return; } + if (((currData.storeFlags | prevData.storeFlags) & (GTF_IND_VOLATILE | GTF_ORDER_SIDEEFF)) != 0) + { + // Keep volatile and other ordering-sensitive stores in their original form. + return; + } + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); int const currEndOffset = currData.offset + static_cast(currData.accessSize); @@ -11055,8 +11059,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // The integer path below places each constant according to its byte offset, so it doesn't need to swap the // values first. 
Only the SIMD packing paths above need to normalize lower/upper order explicitly. - size_t lowerCns = 0; - size_t upperCns = 0; + uint64_t lowerCns = 0; + uint64_t upperCns = 0; if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) { @@ -11092,23 +11096,24 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } #endif // TARGET_64BIT && FEATURE_HW_INTRINSICS - size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; - size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; - size_t newMask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(newType)) * BITS_PER_BYTE; + uint64_t prevMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - prevData.accessSize) * BITS_PER_BYTE; + uint64_t currMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - currData.accessSize) * BITS_PER_BYTE; + uint64_t newMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - genTypeSize(newType)) * BITS_PER_BYTE; lowerCns &= prevMask; upperCns &= currMask; unsigned const prevShift = static_cast((prevData.offset - minOffset) * BITS_PER_BYTE); unsigned const currShift = static_cast((currData.offset - minOffset) * BITS_PER_BYTE); - size_t prevBits = (lowerCns << prevShift) & newMask; - size_t currBits = (upperCns << currShift) & newMask; + uint64_t prevBits = (lowerCns << prevShift) & newMask; + uint64_t currBits = (upperCns << currShift) & newMask; // Later stores must overwrite any overlapping bytes from earlier stores. 
- size_t currBitsMask = (currMask << currShift) & newMask; - size_t val = (prevBits & ~currBitsMask) | currBits; + uint64_t currBitsMask = (currMask << currShift) & newMask; + uint64_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); + assert(currData.value->OperIs(GT_CNS_INT)); auto* intCon = currData.value->AsIntCon(); intCon->gtIconVal = static_cast(val); if (genTypeSize(oldType) == 1 && node->OperIsIndir()) From a0cc8d7a9755d2d75a390868038e3d6a190cb4b8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 08:45:34 -0700 Subject: [PATCH 6/9] fixes --- src/coreclr/jit/lower.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2ac9d6d3ce1a3f..32a9305a84b7bd 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10574,7 +10574,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing //------------------------------------------------------------------------ // GetStoreCoalescingData: get the data needed to perform store coalescing for either -// STOREIND/STORE_BLK or STORE_LCL_FLD. +// STOREIND/STORE_BLK or local stores. 
// // Arguments: // store - the store node @@ -10590,12 +10590,12 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d return GetLoadStoreCoalescingData(store->AsIndir(), data); } - if (!store->OperIs(GT_STORE_LCL_FLD)) + if (!store->OperIsLocalStore()) { return false; } - auto* lclStore = store->AsLclFld(); + auto* lclStore = store->AsLclVarCommon(); if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) { return false; @@ -10610,13 +10610,16 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); data->targetType = store->TypeGet(); - data->accessSize = lclStore->GetSize(); + data->accessSize = store->OperIs(GT_STORE_LCL_FLD) ? lclStore->AsLclFld()->GetSize() : varDsc->lvExactSize(); data->value = lclStore->Data(); data->offset = lclStore->GetLclOffs(); data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; data->isAddressExposed = varDsc->IsAddressExposed(); - data->storeFlags = store->gtFlags; + // Local stores don't use the GTF_IND_* flag space semantically. In particular, GT_STORE_LCL_FLD + // may carry GTF_VAR_USEASG, which shares the same bit as GTF_IND_VOLATILE. Keep only the flags + // that are actually meaningful for indir coalescing decisions. + data->storeFlags = GTF_EMPTY; data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); @@ -10728,7 +10731,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* //------------------------------------------------------------------------ // LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, -// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD. +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD, +// where the previous local store may be either STORE_LCL_VAR or STORE_LCL_FLD. 
// // Arguments: // node - current store node @@ -10772,7 +10776,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if (node->OperIs(GT_STORE_LCL_FLD)) { - if (!prevTree->OperIs(GT_STORE_LCL_FLD)) + if (!prevTree->OperIsLocalStore()) { return; } @@ -10787,9 +10791,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) return; } - if (((currData.storeFlags | prevData.storeFlags) & (GTF_IND_VOLATILE | GTF_ORDER_SIDEEFF)) != 0) + if (((currData.storeFlags | prevData.storeFlags) & GTF_IND_VOLATILE) != 0) { - // Keep volatile and other ordering-sensitive stores in their original form. + // Keep volatile stores in their original form. + // Regular STOREIND nodes may carry GTF_ORDER_SIDEEFF, and the pre-existing + // STOREIND coalescing logic intentionally allowed those cases. return; } @@ -11032,13 +11038,16 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } else { - auto* lclStore = node->AsLclFld(); + auto* lclStore = node->AsLclVarCommon(); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); lclStore->Data()->ClearContained(); lclStore->gtType = newType; lclStore->Data()->gtType = newType; - lclStore->SetLclOffs(min(prevData.offset, currData.offset)); + if (lclStore->OperIs(GT_STORE_LCL_FLD)) + { + lclStore->AsLclFld()->SetLclOffs(min(prevData.offset, currData.offset)); + } } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) From 33c46127146466ea5a5f1c2606722d862397204d Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 09:35:48 -0700 Subject: [PATCH 7/9] remove comment --- src/coreclr/jit/lower.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 32a9305a84b7bd..b173d8c6b9d5e5 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10616,12 +10616,9 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; 
data->isAddressExposed = varDsc->IsAddressExposed(); - // Local stores don't use the GTF_IND_* flag space semantically. In particular, GT_STORE_LCL_FLD - // may carry GTF_VAR_USEASG, which shares the same bit as GTF_IND_VOLATILE. Keep only the flags - // that are actually meaningful for indir coalescing decisions. - data->storeFlags = GTF_EMPTY; - data->rangeStart = range.FirstNode(); - data->rangeEnd = range.LastNode(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); return true; } From 3c96099310c0ba902761f73991ad94201b39c778 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 12:22:44 -0700 Subject: [PATCH 8/9] format --- src/coreclr/jit/lower.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index b173d8c6b9d5e5..62124f149d322c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10616,9 +10616,9 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; data->isAddressExposed = varDsc->IsAddressExposed(); - data->storeFlags = GTF_EMPTY; - data->rangeStart = range.FirstNode(); - data->rangeEnd = range.LastNode(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); return true; } From b276d3375f5cbc6ec7228b542969443c7c7807cc Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 25 Apr 2026 08:38:39 -0700 Subject: [PATCH 9/9] Address PR feedback: guard RetBuf check and make test endian-agnostic - LowerCheckCoalescedStoreAtomicity: also require compRetBuffArg != BAD_VAR_NUM before relaxing atomicity for retbuf stores. Previously, when no retbuf was present, both lclNum and compRetBuffArg defaulted to BAD_VAR_NUM and the comparison trivially held, allowing non-atomic coalescing for unrelated indirect stores. 
- SpAddr regression test: compute the expected `overlapping` value based on BitConverter.IsLittleEndian so the test does not rely on little-endian byte layout. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 3 ++- src/tests/JIT/Directed/StructPromote/SpAddr.cs | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index e26dc6a6edaec4..5d783052b2fe8b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10656,7 +10656,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currData.offset + static_cast(currData.accessSize)); unsigned combinedSize = static_cast(maxEndOffset - minOffset); - if (!allowsNonAtomic && (currData.lclNum == m_compiler->info.compRetBuffArg)) + if (!allowsNonAtomic && (m_compiler->info.compRetBuffArg != BAD_VAR_NUM) && + (currData.lclNum == m_compiler->info.compRetBuffArg)) { // RetBuf is a private stack memory, so we don't need to worry about atomicity. allowsNonAtomic = true; diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index ab5f694192e12e..e44c3176c89c0e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -104,10 +104,14 @@ public static int TestEntryPoint() int overlapping = OverlappingNonZeroThenShort(); long floatPair = AdjacentFloatNonZeroThenZero(); + int expectedOverlapping = BitConverter.IsLittleEndian + ? unchecked((int)0xDEAD0000) + : unchecked((int)0x0000BEEF); + if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == unchecked((int)0xDEAD0000)) && + (overlapping == expectedOverlapping) && (floatPair == 0x000000003F800000L)) return 100; else