From 4f07fab9092dcd49f9c1676e17327440741bf40e Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 07:39:07 -0700 Subject: [PATCH 01/27] JIT: Coalesce adjacent STORE_LCL_FLD nodes Extend store coalescing to handle GT_STORE_LCL_FLD nodes during lowering. When two adjacent STORE_LCL_FLD nodes write to the same local variable at adjacent offsets with the same type, merge them into a single wider store. Two modes are supported: 1. Constant values: combine via bit manipulation (byte+byte->short, short+short->int, int+int->long, etc.), matching the existing STOREIND coalescing approach. 2. Non-constant values (int+int->long only): compose two register values into a single qword via zero-extend CAST + LSH + OR. This eliminates store-forwarding stalls where two 4-byte field stores are followed by an 8-byte struct read. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 315 ++++++++++++++++++++++++++++++++++++++ src/coreclr/jit/lower.h | 1 + 2 files changed, 316 insertions(+) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 8e7976a0e1e9d3..0deb1160458a74 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5654,6 +5654,9 @@ GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore) DISPTREERANGE(BlockRange(), lclStore); JITDUMP("\n"); + // Try to coalesce adjacent GT_STORE_LCL_FLD nodes into a single wider store. + LowerStoreLclFldCoalescing(lclStore); + TryRetypingFloatingPointStoreToIntegerStore(lclStore); GenTree* src = lclStore->gtGetOp1(); @@ -10876,6 +10879,318 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) #endif // TARGET_XARCH || TARGET_ARM64 } +//------------------------------------------------------------------------ +// LowerStoreLclFldCoalescing: If the given GT_STORE_LCL_FLD node is preceded by another +// GT_STORE_LCL_FLD to the same local at an adjacent offset with constant data, merge +// them into a single wider store. This avoids store-forwarding stalls when the struct +// is later read back at the wider width. +// +// Example: +// +// * STORE_LCL_FLD int V04 [+0] +// \--* CNS_INT int 0x1 +// +// * STORE_LCL_FLD int V04 [+4] +// \--* CNS_INT int 0x2 +// +// After coalescing: +// +// * STORE_LCL_FLD long V04 [+0] +// \--* CNS_INT long 0x200000001 +// +// Arguments: +// store - the current GT_STORE_LCL_FLD node +// +void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) +{ +#if defined(TARGET_XARCH) || defined(TARGET_ARM64) + if (!m_compiler->opts.OptimizationEnabled()) + { + return; + } + + if (!store->OperIs(GT_STORE_LCL_FLD)) + { + return; + } + + do + { + if (!store->OperIs(GT_STORE_LCL_FLD)) + { + return; + } + + var_types currType = store->TypeGet(); + unsigned currLclNum = store->GetLclNum(); + unsigned currOffset = store->AsLclFld()->GetLclOffs(); + + if (!varTypeIsIntegral(currType) && !varTypeIsSIMD(currType)) + { + return; + } + + // Make sure the current node's tree range is closed (no unexpected interleaved nodes). + bool isClosedRange = false; + LIR::ReadOnlyRange currRange = BlockRange().GetTreeRange(store, &isClosedRange); + if (!isClosedRange) + { + return; + } + + // Look backward for a previous GT_STORE_LCL_FLD, skipping NOPs and IL_OFFSETs. 
+ GenTree* prevTree = currRange.FirstNode()->gtPrev; + while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET)) + { + prevTree = prevTree->gtPrev; + } + + if ((prevTree == nullptr) || !prevTree->OperIs(GT_STORE_LCL_FLD)) + { + return; + } + + GenTreeLclFld* prevStore = prevTree->AsLclFld(); + + // Must be the same local variable. + if (prevStore->GetLclNum() != currLclNum) + { + return; + } + + // Both stores must be the same type. + var_types prevType = prevStore->TypeGet(); + if (prevType != currType) + { + return; + } + + // The offsets must be adjacent (differ by exactly the type size). + unsigned prevOffset = prevStore->GetLclOffs(); + if (abs((int)prevOffset - (int)currOffset) != (int)genTypeSize(currType)) + { + return; + } + + GenTree* currValue = store->Data(); + GenTree* prevValue = prevStore->Data(); + + // Previous store's tree range must also be closed. + bool isPrevClosedRange = false; + LIR::ReadOnlyRange prevRange = BlockRange().GetTreeRange(prevStore, &isPrevClosedRange); + if (!isPrevClosedRange) + { + return; + } + + bool isCurrConst = currValue->OperIsConst() && + !(currValue->IsCnsIntOrI() && currValue->AsIntCon()->ImmedValNeedsReloc(m_compiler)); + bool isPrevConst = prevValue->OperIsConst() && + !(prevValue->IsCnsIntOrI() && prevValue->AsIntCon()->ImmedValNeedsReloc(m_compiler)); + + // Determine the new wider type. + var_types newType = TYP_UNDEF; + switch (currType) + { + case TYP_BYTE: + case TYP_UBYTE: + newType = TYP_USHORT; + break; + + case TYP_SHORT: + case TYP_USHORT: + newType = TYP_INT; + break; + +#ifdef TARGET_64BIT + case TYP_INT: + newType = TYP_LONG; + break; + +#if defined(FEATURE_HW_INTRINSICS) + case TYP_LONG: + newType = TYP_SIMD16; + break; + +#if defined(TARGET_AMD64) + case TYP_SIMD16: + if (m_compiler->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + return; + + case TYP_SIMD32: + if (m_compiler->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + return; +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT + + default: + return; + } + + assert(newType != TYP_UNDEF); + + // Both constants: combine via bit manipulation (existing path). 
+ if (isCurrConst && isPrevConst) + { + JITDUMP("Coalescing two GT_STORE_LCL_FLD stores into a single wider store:\n"); + JITDUMP(" Previous store: V%02u [+%u] %s\n", currLclNum, prevOffset, varTypeName(prevType)); + JITDUMP(" Current store: V%02u [+%u] %s\n", currLclNum, currOffset, varTypeName(currType)); + JITDUMP(" New type: %s\n", varTypeName(newType)); + + BlockRange().Remove(prevRange.FirstNode(), prevRange.LastNode()); + + unsigned newOffset = min(prevOffset, currOffset); + store->AsLclFld()->SetLclOffs(newOffset); + store->gtType = newType; + currValue->ClearContained(); + +#if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) + if (varTypeIsSIMD(currType)) + { + int8_t* lowerCns = prevValue->AsVecCon()->gtSimdVal.i8; + int8_t* upperCns = currValue->AsVecCon()->gtSimdVal.i8; + + if (prevOffset > currOffset) + { + std::swap(lowerCns, upperCns); + } + + simd_t newCns = {}; + uint32_t oldWidth = genTypeSize(currType); + memcpy(newCns.i8, lowerCns, oldWidth); + memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); + + currValue->AsVecCon()->gtSimdVal = newCns; + currValue->gtType = newType; + continue; + } +#endif + + size_t lowerCns = (size_t)prevValue->AsIntCon()->IconValue(); + size_t upperCns = (size_t)currValue->AsIntCon()->IconValue(); + + if (prevOffset > currOffset) + { + std::swap(lowerCns, upperCns); + } + +#if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) + if (varTypeIsSIMD(newType)) + { + int8_t val[16]; + memcpy(val, &lowerCns, 8); + memcpy(val + 8, &upperCns, 8); + GenTreeVecCon* vecCns = m_compiler->gtNewVconNode(newType, &val); + + BlockRange().InsertAfter(currValue, vecCns); + BlockRange().Remove(currValue); + store->Data() = vecCns; + continue; + } +#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS + + size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(currType)) * BITS_PER_BYTE; + lowerCns &= mask; + upperCns &= mask; + + size_t val = (lowerCns | (upperCns << (genTypeSize(currType) * BITS_PER_BYTE))); + JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); + + currValue->AsIntCon()->gtIconVal = (ssize_t)val; + currValue->gtType = newType; + continue; + } + +#ifdef TARGET_64BIT + // Non-constant values: compose two integral values into a wider value via + // cast + shift + OR. For now, restrict to int+int -> long which is the dominant + // case and avoids complications with small type register widening. + if (currType != TYP_INT || newType != TYP_LONG) + { + return; + } + + // Both values must have closed tree ranges so we can safely relocate them. + bool isCurrValueClosed = false; + bool isPrevValueClosed = false; + BlockRange().GetTreeRange(currValue, &isCurrValueClosed); + BlockRange().GetTreeRange(prevValue, &isPrevValueClosed); + if (!isCurrValueClosed || !isPrevValueClosed) + { + return; + } + + JITDUMP("Coalescing two non-const GT_STORE_LCL_FLD stores via shift+OR:\n"); + JITDUMP(" Previous store: V%02u [+%u] %s\n", currLclNum, prevOffset, varTypeName(prevType)); + JITDUMP(" Current store: V%02u [+%u] %s\n", currLclNum, currOffset, varTypeName(currType)); + JITDUMP(" New type: %s\n", varTypeName(newType)); + + // Identify which value goes in the low bits and which in the high bits. + GenTree* lowValue = (prevOffset < currOffset) ? prevValue : currValue; + GenTree* highValue = (prevOffset < currOffset) ? currValue : prevValue; + + // Clear containment flags — the values may have been marked contained for + // the original stores but need to be uncontained for the new CAST/OR tree. 
+ lowValue->ClearContained(); + highValue->ClearContained(); + + // Remove the previous store node, keeping its data subtree in the LIR. + prevStore->Data() = nullptr; + BlockRange().Remove(prevStore); + + // Zero-extend both values to the new wider type (int -> long). + unsigned shiftBits = genTypeSize(currType) * BITS_PER_BYTE; + + if (genTypeSize(newType) > genTypeSize(genActualType(lowValue))) + { + GenTree* castLow = m_compiler->gtNewCastNode(newType, lowValue, true, newType); + BlockRange().InsertBefore(store, castLow); + lowValue = castLow; + } + + if (genTypeSize(newType) > genTypeSize(genActualType(highValue))) + { + GenTree* castHigh = m_compiler->gtNewCastNode(newType, highValue, true, newType); + BlockRange().InsertBefore(store, castHigh); + highValue = castHigh; + } + + // Shift the high value left. + GenTree* shiftAmount = m_compiler->gtNewIconNode((ssize_t)shiftBits); + GenTree* shifted = m_compiler->gtNewOperNode(GT_LSH, newType, highValue, shiftAmount); + BlockRange().InsertBefore(store, shiftAmount, shifted); + + // OR the low and shifted-high values. + GenTree* combined = m_compiler->gtNewOperNode(GT_OR, newType, lowValue, shifted); + BlockRange().InsertBefore(store, combined); + + // Update the current store to use the combined value at the lower offset. + unsigned newOffset = min(prevOffset, currOffset); + store->AsLclFld()->SetLclOffs(newOffset); + store->gtType = newType; + store->Data() = combined; + + JITDUMP("Coalesced two non-const stores into a single store via shift+OR\n"); + // Don't loop for non-constant coalescing — the combined value is no longer a simple + // constant, so further coalescing would need a different approach. + return; +#else + return; +#endif // TARGET_64BIT + + } while (true); +#endif // TARGET_XARCH || TARGET_ARM64 +} + //------------------------------------------------------------------------ // LowerStoreIndirCommon: a common logic to lower StoreIndir. // diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index b135a9a3109e1c..a70e9a3940167d 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -389,6 +389,7 @@ class Lowering final : public Phase void UnmarkTree(GenTree* root); GenTree* LowerStoreIndir(GenTreeStoreInd* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); + void LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); From f8a89a73acd8e2193ae114f01590704caf9dd8e1 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 10:10:58 -0700 Subject: [PATCH 02/27] JIT: Add heuristic for non-constant STORE_LCL_FLD coalescing Non-constant coalescing (composing two register values via shift+OR) increases code size. Only apply it when a wider read from the same local is found within the next 20 LIR nodes, indicating a likely store-forwarding stall that the coalescing would prevent. This avoids unnecessary code size increases in cases where the individual field stores are not followed by a wider struct read. 
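Roughly the shape the scan is looking for (local numbers, offsets, and
the wider read below are illustrative, not taken from a real dump):

    * STORE_LCL_FLD int  V04 [+0]   ; non-constant data
    * STORE_LCL_FLD int  V04 [+4]   ; non-constant data
    ...                             ; < 20 LIR nodes, no control flow,
                                    ;   no other store to V04
    * LCL_FLD       long V04 [+0]   ; wider overlapping read

If no such read is found before the scan limit, a store to the same
local, or a control-flow node, the two narrow stores are left as-is.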
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 0deb1160458a74..7a8abb48399d58 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11119,6 +11119,62 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) return; } + // Non-constant coalescing increases code size (adds cast+shift+or instructions). + // Only do it if we can see that the local is read back at a wider width within + // the next few instructions, which would cause a store-forwarding stall. + { + unsigned newOffset = min(prevOffset, currOffset); + unsigned combinedSize = genTypeSize(currType) * 2; + bool foundWiderRead = false; + GenTree* scanNode = store->gtNext; + const int scanLimit = 20; + + for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + { + // Stop scanning at control flow boundaries. + if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || + scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) + { + break; + } + + // Check for a local read from the same variable that overlaps our store range + // at a wider width. + if (scanNode->OperIsLocalRead() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) + { + unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); + unsigned readSize = genTypeSize(scanNode->TypeGet()); + if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) + { + readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); + } + + // Does this read overlap our combined store range and is it wider? + if (readSize > genTypeSize(currType) && readOffset <= newOffset && + (readOffset + readSize) >= (newOffset + combinedSize)) + { + foundWiderRead = true; + break; + } + } + + // Stop at stores to the same local (would invalidate the forwarding scenario). + if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) + { + break; + } + + scanNode = scanNode->gtNext; + } + + if (!foundWiderRead) + { + JITDUMP("Skipping non-const GT_STORE_LCL_FLD coalescing: no wider read found within %d nodes\n", + scanLimit); + return; + } + } + // Both values must have closed tree ranges so we can safely relocate them. bool isCurrValueClosed = false; bool isPrevValueClosed = false; From 2b3926f07d10804551fee33ac1be6413dea4b10f Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 10:35:30 -0700 Subject: [PATCH 03/27] JIT: Fold narrow constant stores into preceding wider zero-init When a wider constant-zero STORE_LCL_FLD is followed by a narrower constant STORE_LCL_FLD to the same local at an offset within the wider store's range, fold the narrow value into the wider zero constant and remove the narrow store. Example: STORE_LCL_FLD int [+0] = 0; STORE_LCL_FLD ubyte [+0] = 1 => STORE_LCL_FLD int [+0] = 1 This eliminates store-forwarding stalls where a struct is zero- initialized then has individual fields set before being read back. 
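A worked example of the splice (offsets and values are illustrative):

    STORE_LCL_FLD int   V04 [+0] = 0
    STORE_LCL_FLD ubyte V04 [+2] = 1
    ; insertBitOffset = (2 - 0) * 8 = 16, combined = 1 << 16
    =>
    STORE_LCL_FLD int   V04 [+0] = 0x10000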
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 50 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 7a8abb48399d58..a0c2dd3ae73cea 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10962,6 +10962,56 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) var_types prevType = prevStore->TypeGet(); if (prevType != currType) { + // Special case: a wider constant-zero store followed by a narrower constant store + // that's fully contained within the wider store's range. We can fold the narrow + // value into the wider zero constant, eliminating the narrow store entirely. + // Example: STORE_LCL_FLD int [+0] = 0; STORE_LCL_FLD ubyte [+0] = 1 + // => STORE_LCL_FLD int [+0] = 1 + unsigned prevSize = genTypeSize(prevType); + unsigned prevOffset = prevStore->GetLclOffs(); + + if (varTypeIsIntegral(prevType) && varTypeIsIntegral(currType) && prevSize > genTypeSize(currType) && + currOffset >= prevOffset && (currOffset + genTypeSize(currType)) <= (prevOffset + prevSize)) + { + GenTree* prevValue = prevStore->Data(); + GenTree* currValue = store->Data(); + + bool isPrevClosedRange = false; + LIR::ReadOnlyRange prevRange = BlockRange().GetTreeRange(prevStore, &isPrevClosedRange); + if (!isPrevClosedRange) + { + return; + } + + if (prevValue->IsIntegralConst(0) && currValue->IsCnsIntOrI() && + !currValue->AsIntCon()->ImmedValNeedsReloc(m_compiler)) + { + unsigned insertBitOffset = (currOffset - prevOffset) * BITS_PER_BYTE; + size_t mask = (~(size_t(0)) >> ((sizeof(size_t) - genTypeSize(currType)) * BITS_PER_BYTE)); + size_t narrowVal = (size_t)currValue->AsIntCon()->IconValue() & mask; + size_t combined = narrowVal << insertBitOffset; + + JITDUMP("Folding narrow constant store into wider zero-init:\n"); + JITDUMP(" Wide store: V%02u [+%u] %s = 0\n", currLclNum, prevOffset, varTypeName(prevType)); + JITDUMP(" Narrow store: V%02u [+%u] %s = %lld\n", currLclNum, currOffset, + varTypeName(currType), (int64_t)currValue->AsIntCon()->IconValue()); + JITDUMP(" Combined value: %lld\n", (int64_t)combined); + + // Remove the previous (wider zero) store. + BlockRange().Remove(prevRange.FirstNode(), prevRange.LastNode()); + + // Update the current store to be the wider type with the combined value. + store->AsLclFld()->SetLclOffs(prevOffset); + store->gtType = prevType; + currValue->gtType = prevType; + currValue->AsIntCon()->gtIconVal = (ssize_t)combined; + currValue->ClearContained(); + + // Continue to try further coalescing with the now-widened store. + continue; + } + } + return; } From 48da287bae28935db5caf73f4fd978755c3430fa Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 10:43:08 -0700 Subject: [PATCH 04/27] JIT: Generalize narrow-into-wide fold beyond zero-init Extend the fold to handle any wider constant store (not just zero) followed by a narrower constant store within its range. The narrow value is spliced into the wider constant by clearing the affected bits and OR-ing in the new value. This handles chains of folds where the first fold produces a non-zero wider constant and subsequent narrow stores need to be folded into it. 
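A worked example with a non-zero wide constant (values are illustrative):

    STORE_LCL_FLD int   V04 [+0] = 0x11223344
    STORE_LCL_FLD ubyte V04 [+1] = 0xAA
    ; insertBitOffset = 8, narrowMask = 0xFF, clearMask = ~(0xFF << 8)
    ; combined = (0x11223344 & ~0xFF00) | (0xAA << 8)
    =>
    STORE_LCL_FLD int   V04 [+0] = 0x1122AA44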
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index a0c2dd3ae73cea..97b4a7be2b926d 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10962,11 +10962,13 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) var_types prevType = prevStore->TypeGet(); if (prevType != currType) { - // Special case: a wider constant-zero store followed by a narrower constant store - // that's fully contained within the wider store's range. We can fold the narrow - // value into the wider zero constant, eliminating the narrow store entirely. - // Example: STORE_LCL_FLD int [+0] = 0; STORE_LCL_FLD ubyte [+0] = 1 + // Special case: a wider constant store followed by a narrower constant store + // that's fully contained within the wider store's range. We can splice the narrow + // value into the wider constant, eliminating the narrow store. + // Example: STORE_LCL_FLD int [+0] = 0; STORE_LCL_FLD ubyte [+0] = 1 // => STORE_LCL_FLD int [+0] = 1 + // Example: STORE_LCL_FLD int [+0] = 256; STORE_LCL_FLD ubyte [+0] = 0 + // => STORE_LCL_FLD int [+0] = 256 (narrow store is redundant) unsigned prevSize = genTypeSize(prevType); unsigned prevOffset = prevStore->GetLclOffs(); @@ -10983,26 +10985,33 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) return; } - if (prevValue->IsIntegralConst(0) && currValue->IsCnsIntOrI() && - !currValue->AsIntCon()->ImmedValNeedsReloc(m_compiler)) + if (prevValue->IsCnsIntOrI() && !prevValue->AsIntCon()->ImmedValNeedsReloc(m_compiler) && + currValue->IsCnsIntOrI() && !currValue->AsIntCon()->ImmedValNeedsReloc(m_compiler)) { unsigned insertBitOffset = (currOffset - prevOffset) * BITS_PER_BYTE; - size_t mask = (~(size_t(0)) >> ((sizeof(size_t) - genTypeSize(currType)) * BITS_PER_BYTE)); - size_t narrowVal = (size_t)currValue->AsIntCon()->IconValue() & mask; - size_t combined = narrowVal << insertBitOffset; - - JITDUMP("Folding narrow constant store into wider zero-init:\n"); - JITDUMP(" Wide store: V%02u [+%u] %s = 0\n", currLclNum, prevOffset, varTypeName(prevType)); + unsigned narrowBits = genTypeSize(currType) * BITS_PER_BYTE; + size_t narrowMask = (~(size_t(0)) >> (sizeof(size_t) * BITS_PER_BYTE - narrowBits)); + size_t narrowVal = (size_t)currValue->AsIntCon()->IconValue() & narrowMask; + size_t wideVal = (size_t)prevValue->AsIntCon()->IconValue(); + + // Clear the bits in the wide value where the narrow value will be inserted, + // then OR in the narrow value. + size_t clearMask = ~(narrowMask << insertBitOffset); + size_t combined = (wideVal & clearMask) | (narrowVal << insertBitOffset); + + JITDUMP("Folding narrow constant store into wider constant store:\n"); + JITDUMP(" Wide store: V%02u [+%u] %s = %lld\n", currLclNum, prevOffset, + varTypeName(prevType), (int64_t)prevValue->AsIntCon()->IconValue()); JITDUMP(" Narrow store: V%02u [+%u] %s = %lld\n", currLclNum, currOffset, varTypeName(currType), (int64_t)currValue->AsIntCon()->IconValue()); JITDUMP(" Combined value: %lld\n", (int64_t)combined); - // Remove the previous (wider zero) store. + // Remove the previous (wider) store. BlockRange().Remove(prevRange.FirstNode(), prevRange.LastNode()); // Update the current store to be the wider type with the combined value. 
store->AsLclFld()->SetLclOffs(prevOffset); - store->gtType = prevType; + store->gtType = prevType; currValue->gtType = prevType; currValue->AsIntCon()->gtIconVal = (ssize_t)combined; currValue->ClearContained(); From 187dd5b425fc2828f05373effcf31d4c3ca39536 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 11:08:40 -0700 Subject: [PATCH 05/27] JIT: Bypass constant store+load for last-use locals After store coalescing produces a constant STORE_LCL_FLD, scan ahead for a same-sized read from the same local that is the last use. When found, replace the read with the constant value directly, eliminating both the store-to-stack and load-from-stack instructions. For untracked (do-not-enreg) locals where GTF_VAR_DEATH may not be set, verify last-use by scanning forward to confirm no further reads exist before the next store to the same local. This completely eliminates the store+load round-trip for struct field initialization patterns like: STORE_LCL_FLD int V02 [+0] = LCL_VAR struct V02 (last use) STOREIND [static_field], V02 => STOREIND [static_field], Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 102 +++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 97b4a7be2b926d..da14936206595a 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10947,7 +10947,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) if ((prevTree == nullptr) || !prevTree->OperIs(GT_STORE_LCL_FLD)) { - return; + break; } GenTreeLclFld* prevStore = prevTree->AsLclFld(); @@ -11303,6 +11303,106 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) #endif // TARGET_64BIT } while (true); + + // After coalescing, if the store has a constant value, look ahead for a same-sized + // last-use read from the same local. If found, we can bypass the store+load entirely + // by replacing the load with the constant value and removing the store. + if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && + !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler)) + { + unsigned lclNum = store->GetLclNum(); + unsigned offset = store->AsLclFld()->GetLclOffs(); + var_types storeType = store->TypeGet(); + + GenTree* scanNode = store->gtNext; + const int scanLimit = 20; + + for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + { + if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || + scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) + { + break; + } + + // Found a read from the same local, same offset, same size. + if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && + scanNode->AsLclVarCommon()->GetLclNum() == lclNum && + scanNode->AsLclVarCommon()->GetLclOffs() == offset) + { + unsigned readSize = genTypeSize(scanNode->TypeGet()); + if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) + { + readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); + } + + if (readSize != genTypeSize(storeType)) + { + break; // Size mismatch, can't bypass. + } + // Only bypass if this is the last use — either GTF_VAR_DEATH is set, + // or the local is do-not-enreg and we can verify no further reads. 
+ bool isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); + if (!isLastUse && m_compiler->lvaGetDesc(lclNum)->lvDoNotEnregister) + { + // For untracked locals, scan forward to verify no more reads. + isLastUse = true; + GenTree* chk = scanNode->gtNext; + for (int chkCount = 0; chk != nullptr && chkCount < scanLimit; chkCount++) + { + if (chk->OperIsLocalRead() && chk->AsLclVarCommon()->GetLclNum() == lclNum) + { + isLastUse = false; + break; + } + if (chk->OperIsLocalStore() && chk->AsLclVarCommon()->GetLclNum() == lclNum) + { + break; // redefined before next read = this was the last use + } + chk = chk->gtNext; + } + } + + if (isLastUse) + { + JITDUMP("Bypassing constant store+load for V%02u [+%u]: replacing load with constant %lld\n", + lclNum, offset, (int64_t)store->Data()->AsIntCon()->IconValue()); + + // Create a new constant node and insert it in place of the load. + ssize_t constVal = store->Data()->AsIntCon()->IconValue(); + GenTree* newConst = m_compiler->gtNewIconNode(constVal, storeType); + BlockRange().InsertAfter(scanNode, newConst); + + // Replace all uses of the load with the new constant. + LIR::Use use; + if (BlockRange().TryGetUse(scanNode, &use)) + { + use.ReplaceWith(newConst); + } + + // Remove the load node. + BlockRange().Remove(scanNode); + + // The store is now dead (its value was forwarded to the load's user). + // We leave it in place — removing it here would cause issues since + // LowerStoreLocCommon is still processing this node. Later phases + // or the register allocator can handle the dead store. + break; + } + // Not the last use — stop scanning, can't bypass. + break; + } + + // Stop at stores to the same local (would change the value). + if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) + { + break; + } + + scanNode = scanNode->gtNext; + } + } + #endif // TARGET_XARCH || TARGET_ARM64 } From 21130c7ffc67430a0e754473f18e35210fe45a98 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 11:26:16 -0700 Subject: [PATCH 06/27] JIT: Revert store-load bypass due to correctness issue The store-load bypass (forwarding constants to reads and leaving dead stores) has a subtle interaction with the narrow-into-wide fold: when a local is reused across multiple struct initializations, the dead store's value gets picked up by subsequent folds, producing incorrect combined constants. Revert the bypass for now. The coalescing and fold optimizations remain and are correct. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 101 +------------------------------------- 1 file changed, 1 insertion(+), 100 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index da14936206595a..20ac2b9daee212 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10947,7 +10947,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) if ((prevTree == nullptr) || !prevTree->OperIs(GT_STORE_LCL_FLD)) { - break; + return; } GenTreeLclFld* prevStore = prevTree->AsLclFld(); @@ -11304,105 +11304,6 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } while (true); - // After coalescing, if the store has a constant value, look ahead for a same-sized - // last-use read from the same local. If found, we can bypass the store+load entirely - // by replacing the load with the constant value and removing the store. 
- if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && - !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler)) - { - unsigned lclNum = store->GetLclNum(); - unsigned offset = store->AsLclFld()->GetLclOffs(); - var_types storeType = store->TypeGet(); - - GenTree* scanNode = store->gtNext; - const int scanLimit = 20; - - for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) - { - if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || - scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) - { - break; - } - - // Found a read from the same local, same offset, same size. - if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && - scanNode->AsLclVarCommon()->GetLclNum() == lclNum && - scanNode->AsLclVarCommon()->GetLclOffs() == offset) - { - unsigned readSize = genTypeSize(scanNode->TypeGet()); - if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) - { - readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); - } - - if (readSize != genTypeSize(storeType)) - { - break; // Size mismatch, can't bypass. - } - // Only bypass if this is the last use — either GTF_VAR_DEATH is set, - // or the local is do-not-enreg and we can verify no further reads. - bool isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); - if (!isLastUse && m_compiler->lvaGetDesc(lclNum)->lvDoNotEnregister) - { - // For untracked locals, scan forward to verify no more reads. - isLastUse = true; - GenTree* chk = scanNode->gtNext; - for (int chkCount = 0; chk != nullptr && chkCount < scanLimit; chkCount++) - { - if (chk->OperIsLocalRead() && chk->AsLclVarCommon()->GetLclNum() == lclNum) - { - isLastUse = false; - break; - } - if (chk->OperIsLocalStore() && chk->AsLclVarCommon()->GetLclNum() == lclNum) - { - break; // redefined before next read = this was the last use - } - chk = chk->gtNext; - } - } - - if (isLastUse) - { - JITDUMP("Bypassing constant store+load for V%02u [+%u]: replacing load with constant %lld\n", - lclNum, offset, (int64_t)store->Data()->AsIntCon()->IconValue()); - - // Create a new constant node and insert it in place of the load. - ssize_t constVal = store->Data()->AsIntCon()->IconValue(); - GenTree* newConst = m_compiler->gtNewIconNode(constVal, storeType); - BlockRange().InsertAfter(scanNode, newConst); - - // Replace all uses of the load with the new constant. - LIR::Use use; - if (BlockRange().TryGetUse(scanNode, &use)) - { - use.ReplaceWith(newConst); - } - - // Remove the load node. - BlockRange().Remove(scanNode); - - // The store is now dead (its value was forwarded to the load's user). - // We leave it in place — removing it here would cause issues since - // LowerStoreLocCommon is still processing this node. Later phases - // or the register allocator can handle the dead store. - break; - } - // Not the last use — stop scanning, can't bypass. - break; - } - - // Stop at stores to the same local (would change the value). - if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) - { - break; - } - - scanNode = scanNode->gtNext; - } - } - #endif // TARGET_XARCH || TARGET_ARM64 } From 8c294d8f27d12c4c3e1afe40b7e6c6261e01895d Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 12:01:45 -0700 Subject: [PATCH 07/27] Revert "JIT: Revert store-load bypass due to correctness issue" This reverts commit 21130c7ffc67430a0e754473f18e35210fe45a98. 
--- src/coreclr/jit/lower.cpp | 101 +++++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 20ac2b9daee212..da14936206595a 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10947,7 +10947,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) if ((prevTree == nullptr) || !prevTree->OperIs(GT_STORE_LCL_FLD)) { - return; + break; } GenTreeLclFld* prevStore = prevTree->AsLclFld(); @@ -11304,6 +11304,105 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } while (true); + // After coalescing, if the store has a constant value, look ahead for a same-sized + // last-use read from the same local. If found, we can bypass the store+load entirely + // by replacing the load with the constant value and removing the store. + if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && + !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler)) + { + unsigned lclNum = store->GetLclNum(); + unsigned offset = store->AsLclFld()->GetLclOffs(); + var_types storeType = store->TypeGet(); + + GenTree* scanNode = store->gtNext; + const int scanLimit = 20; + + for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + { + if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || + scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) + { + break; + } + + // Found a read from the same local, same offset, same size. + if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && + scanNode->AsLclVarCommon()->GetLclNum() == lclNum && + scanNode->AsLclVarCommon()->GetLclOffs() == offset) + { + unsigned readSize = genTypeSize(scanNode->TypeGet()); + if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) + { + readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); + } + + if (readSize != genTypeSize(storeType)) + { + break; // Size mismatch, can't bypass. + } + // Only bypass if this is the last use — either GTF_VAR_DEATH is set, + // or the local is do-not-enreg and we can verify no further reads. + bool isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); + if (!isLastUse && m_compiler->lvaGetDesc(lclNum)->lvDoNotEnregister) + { + // For untracked locals, scan forward to verify no more reads. + isLastUse = true; + GenTree* chk = scanNode->gtNext; + for (int chkCount = 0; chk != nullptr && chkCount < scanLimit; chkCount++) + { + if (chk->OperIsLocalRead() && chk->AsLclVarCommon()->GetLclNum() == lclNum) + { + isLastUse = false; + break; + } + if (chk->OperIsLocalStore() && chk->AsLclVarCommon()->GetLclNum() == lclNum) + { + break; // redefined before next read = this was the last use + } + chk = chk->gtNext; + } + } + + if (isLastUse) + { + JITDUMP("Bypassing constant store+load for V%02u [+%u]: replacing load with constant %lld\n", + lclNum, offset, (int64_t)store->Data()->AsIntCon()->IconValue()); + + // Create a new constant node and insert it in place of the load. + ssize_t constVal = store->Data()->AsIntCon()->IconValue(); + GenTree* newConst = m_compiler->gtNewIconNode(constVal, storeType); + BlockRange().InsertAfter(scanNode, newConst); + + // Replace all uses of the load with the new constant. + LIR::Use use; + if (BlockRange().TryGetUse(scanNode, &use)) + { + use.ReplaceWith(newConst); + } + + // Remove the load node. 
+ BlockRange().Remove(scanNode); + + // The store is now dead (its value was forwarded to the load's user). + // We leave it in place — removing it here would cause issues since + // LowerStoreLocCommon is still processing this node. Later phases + // or the register allocator can handle the dead store. + break; + } + // Not the last use — stop scanning, can't bypass. + break; + } + + // Stop at stores to the same local (would change the value). + if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) + { + break; + } + + scanNode = scanNode->gtNext; + } + } + #endif // TARGET_XARCH || TARGET_ARM64 } From 0c83c0150ec161b3dc7650327ae7f35f55a81eea Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 13:11:20 -0700 Subject: [PATCH 08/27] tweak; format --- src/coreclr/jit/lower.cpp | 50 +++++++++++++-------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index da14936206595a..35fabd21df159c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10978,8 +10978,8 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) GenTree* prevValue = prevStore->Data(); GenTree* currValue = store->Data(); - bool isPrevClosedRange = false; - LIR::ReadOnlyRange prevRange = BlockRange().GetTreeRange(prevStore, &isPrevClosedRange); + bool isPrevClosedRange = false; + LIR::ReadOnlyRange prevRange = BlockRange().GetTreeRange(prevStore, &isPrevClosedRange); if (!isPrevClosedRange) { return; @@ -10996,14 +10996,14 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // Clear the bits in the wide value where the narrow value will be inserted, // then OR in the narrow value. - size_t clearMask = ~(narrowMask << insertBitOffset); - size_t combined = (wideVal & clearMask) | (narrowVal << insertBitOffset); + size_t clearMask = ~(narrowMask << insertBitOffset); + size_t combined = (wideVal & clearMask) | (narrowVal << insertBitOffset); JITDUMP("Folding narrow constant store into wider constant store:\n"); - JITDUMP(" Wide store: V%02u [+%u] %s = %lld\n", currLclNum, prevOffset, - varTypeName(prevType), (int64_t)prevValue->AsIntCon()->IconValue()); - JITDUMP(" Narrow store: V%02u [+%u] %s = %lld\n", currLclNum, currOffset, - varTypeName(currType), (int64_t)currValue->AsIntCon()->IconValue()); + JITDUMP(" Wide store: V%02u [+%u] %s = %lld\n", currLclNum, prevOffset, varTypeName(prevType), + (int64_t)prevValue->AsIntCon()->IconValue()); + JITDUMP(" Narrow store: V%02u [+%u] %s = %lld\n", currLclNum, currOffset, varTypeName(currType), + (int64_t)currValue->AsIntCon()->IconValue()); JITDUMP(" Combined value: %lld\n", (int64_t)combined); // Remove the previous (wider) store. @@ -11327,8 +11327,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // Found a read from the same local, same offset, same size. if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && - scanNode->AsLclVarCommon()->GetLclNum() == lclNum && - scanNode->AsLclVarCommon()->GetLclOffs() == offset) + scanNode->AsLclVarCommon()->GetLclNum() == lclNum && scanNode->AsLclVarCommon()->GetLclOffs() == offset) { unsigned readSize = genTypeSize(scanNode->TypeGet()); if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) @@ -11340,33 +11339,16 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) { break; // Size mismatch, can't bypass. 
} - // Only bypass if this is the last use — either GTF_VAR_DEATH is set, - // or the local is do-not-enreg and we can verify no further reads. - bool isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); - if (!isLastUse && m_compiler->lvaGetDesc(lclNum)->lvDoNotEnregister) - { - // For untracked locals, scan forward to verify no more reads. - isLastUse = true; - GenTree* chk = scanNode->gtNext; - for (int chkCount = 0; chk != nullptr && chkCount < scanLimit; chkCount++) - { - if (chk->OperIsLocalRead() && chk->AsLclVarCommon()->GetLclNum() == lclNum) - { - isLastUse = false; - break; - } - if (chk->OperIsLocalStore() && chk->AsLclVarCommon()->GetLclNum() == lclNum) - { - break; // redefined before next read = this was the last use - } - chk = chk->gtNext; - } - } + // Only bypass if this is the last use + + bool const isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); if (isLastUse) { - JITDUMP("Bypassing constant store+load for V%02u [+%u]: replacing load with constant %lld\n", - lclNum, offset, (int64_t)store->Data()->AsIntCon()->IconValue()); + JITDUMP( + "Bypassing constant store [%06u] + last-use load [%02u] for V%02u[+%u]: replacing load with constant %lld\n", + m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, offset, + (int64_t)store->Data()->AsIntCon()->IconValue()); // Create a new constant node and insert it in place of the load. ssize_t constVal = store->Data()->AsIntCon()->IconValue(); From fc88ab9352d44f11585f02002c47fe90167167d9 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 13:19:53 -0700 Subject: [PATCH 09/27] JIT: Fix store-load bypass with GTF_ICON_STRUCT_INIT_VAL flag When the store-load bypass forwards a constant value to replace a struct load, mark the constant with a new GTF_ICON_STRUCT_INIT_VAL flag. This tells TryTransformStoreObjAsStoreInd to use the constant as-is rather than treating its low byte as a fill pattern to replicate (via gtNewConWithPattern). Without this flag, a forwarded value like 1 would be expanded to 0x01010101 by the STORE_BLK -> STOREIND transformation, corrupting the struct value. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/gentree.h | 1 + src/coreclr/jit/lower.cpp | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index c5484f70896e28..69592d8351b6bc 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -519,6 +519,7 @@ enum GenTreeFlags : unsigned // GTF_ICON_REUSE_REG_VAL = 0x00800000 // GT_CNS_INT -- GTF_REUSE_REG_VAL, defined above GTF_ICON_SIMD_COUNT = 0x00200000, // GT_CNS_INT -- constant is Vector.Count + GTF_ICON_STRUCT_INIT_VAL = 0x00100000, // GT_CNS_INT -- constant is a full struct init value, not a byte pattern GTF_OVERFLOW = 0x10000000, // Supported for: GT_ADD, GT_SUB, GT_MUL and GT_CAST. // Requires an overflow check. Use gtOverflow(Ex)() to check this flag. diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 35fabd21df159c..6e062d7663351b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11351,8 +11351,11 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) (int64_t)store->Data()->AsIntCon()->IconValue()); // Create a new constant node and insert it in place of the load. + // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd + // uses it as-is instead of treating the low byte as a fill pattern. 
ssize_t constVal = store->Data()->AsIntCon()->IconValue(); GenTree* newConst = m_compiler->gtNewIconNode(constVal, storeType); + newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; BlockRange().InsertAfter(scanNode, newConst); // Replace all uses of the load with the new constant. @@ -12366,11 +12369,21 @@ bool Lowering::TryTransformStoreObjAsStoreInd(GenTreeBlk* blkNode) src = src->gtGetOp1(); } - uint8_t initVal = static_cast(src->AsIntCon()->IconValue()); - GenTree* cnsVec = m_compiler->gtNewConWithPattern(regType, initVal); - BlockRange().InsertAfter(src, cnsVec); - BlockRange().Remove(src); - blkNode->SetData(cnsVec); + // If the constant was produced by store-load bypass (GTF_ICON_STRUCT_INIT_VAL), + // it already holds the complete struct value — don't reinterpret the low byte + // as a fill pattern. + if ((src->gtFlags & GTF_ICON_STRUCT_INIT_VAL) != 0) + { + src->ChangeType(regType); + } + else + { + uint8_t initVal = static_cast(src->AsIntCon()->IconValue()); + GenTree* cnsVec = m_compiler->gtNewConWithPattern(regType, initVal); + BlockRange().InsertAfter(src, cnsVec); + BlockRange().Remove(src); + blkNode->SetData(cnsVec); + } } else if (varTypeIsStruct(src)) { From ffe5df5341586c96cc82cf6905aaff7abab8311f Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 13:25:01 -0700 Subject: [PATCH 10/27] JIT: Relax store-load bypass to not require last use Replace the GTF_VAR_DEATH last-use check with IsInvariantInRange to verify the stored value can reach the load. This allows the bypass to fire even when the load is not the last use of the local, as long as the constant value is invariant between the store and load points. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6e062d7663351b..b7873ddf43730a 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11339,14 +11339,15 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) { break; // Size mismatch, can't bypass. } - // Only bypass if this is the last use - bool const isLastUse = ((scanNode->gtFlags & GTF_VAR_DEATH) != 0); - - if (isLastUse) + // Verify the stored value can reach the load — the store's data must be + // invariant between the store and the load (no intervening modifications). + if (!IsInvariantInRange(store->Data(), scanNode)) { - JITDUMP( - "Bypassing constant store [%06u] + last-use load [%02u] for V%02u[+%u]: replacing load with constant %lld\n", + break; + } + + JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: value %lld\n", m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, offset, (int64_t)store->Data()->AsIntCon()->IconValue()); @@ -11373,9 +11374,6 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // LowerStoreLocCommon is still processing this node. Later phases // or the register allocator can handle the dead store. break; - } - // Not the last use — stop scanning, can't bypass. - break; } // Stop at stores to the same local (would change the value). 
From 0ceed698af3100c1179cc830233df054db6927b2 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 13:38:31 -0700 Subject: [PATCH 11/27] JIT: Support partial constant forwarding in store-load bypass When the load reads a sub-range of the stored constant (e.g., a byte field from a dword store), extract the appropriate bits from the constant and forward them. This handles cases where a wider store covers multiple fields and a subsequent load reads just one field. The forwarded value is computed by shifting and masking the stored constant based on the read offset and size within the store range. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 80 ++++++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index b7873ddf43730a..853e131db1c5ed 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11313,6 +11313,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) unsigned lclNum = store->GetLclNum(); unsigned offset = store->AsLclFld()->GetLclOffs(); var_types storeType = store->TypeGet(); + unsigned storeSize = genTypeSize(storeType); GenTree* scanNode = store->gtNext; const int scanLimit = 20; @@ -11325,19 +11326,27 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) break; } - // Found a read from the same local, same offset, same size. + // Found a read from the same local that may overlap our store. if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && - scanNode->AsLclVarCommon()->GetLclNum() == lclNum && scanNode->AsLclVarCommon()->GetLclOffs() == offset) + scanNode->AsLclVarCommon()->GetLclNum() == lclNum) { - unsigned readSize = genTypeSize(scanNode->TypeGet()); + unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); + unsigned readSize = genTypeSize(scanNode->TypeGet()); if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) { readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); } - if (readSize != genTypeSize(storeType)) + // The read must be fully contained within the store's range. + if (readOffset < offset || (readOffset + readSize) > (offset + storeSize)) { - break; // Size mismatch, can't bypass. + break; // Read extends outside the store — can't forward. + } + + // Only forward to integral reads (not struct reads wider than a register). + if (!varTypeIsIntegral(scanNode->TypeGet()) && !scanNode->TypeIs(TYP_STRUCT)) + { + break; } // Verify the stored value can reach the load — the store's data must be @@ -11347,33 +11356,44 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) break; } - JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: value %lld\n", - m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, offset, - (int64_t)store->Data()->AsIntCon()->IconValue()); - - // Create a new constant node and insert it in place of the load. - // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd - // uses it as-is instead of treating the low byte as a fill pattern. - ssize_t constVal = store->Data()->AsIntCon()->IconValue(); - GenTree* newConst = m_compiler->gtNewIconNode(constVal, storeType); - newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; - BlockRange().InsertAfter(scanNode, newConst); - - // Replace all uses of the load with the new constant. 
- LIR::Use use; - if (BlockRange().TryGetUse(scanNode, &use)) - { - use.ReplaceWith(newConst); - } + // Extract the portion of the constant that corresponds to the read. + ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); + unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; + size_t readMask = (readSize >= sizeof(size_t)) + ? ~(size_t)0 + : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; + ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + + // For the forwarded constant, use the read's type if it's integral, + // otherwise use the store type. + var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? scanNode->TypeGet() : storeType; + + JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " + "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", + m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, + (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, + readOffset, readSize); + + // Create a new constant node and insert it in place of the load. + // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd + // uses it as-is instead of treating the low byte as a fill pattern. + GenTree* newConst = m_compiler->gtNewIconNode(forwardVal, fwdType); + newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; + BlockRange().InsertAfter(scanNode, newConst); + + // Replace all uses of the load with the new constant. + LIR::Use use; + if (BlockRange().TryGetUse(scanNode, &use)) + { + use.ReplaceWith(newConst); + } - // Remove the load node. - BlockRange().Remove(scanNode); + // Remove the load node. + BlockRange().Remove(scanNode); - // The store is now dead (its value was forwarded to the load's user). - // We leave it in place — removing it here would cause issues since - // LowerStoreLocCommon is still processing this node. Later phases - // or the register allocator can handle the dead store. - break; + // The store may now be dead, but we leave it in place since + // LowerStoreLocCommon is still processing this node. + break; } // Stop at stores to the same local (would change the value). From 190473ba1d986eb06fb219bd9a2d41f75458066c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 13:58:37 -0700 Subject: [PATCH 12/27] JIT: Remove partial forward debug trace Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 853e131db1c5ed..c8c5063510b4e2 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11357,12 +11357,12 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } // Extract the portion of the constant that corresponds to the read. - ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); - unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; - size_t readMask = (readSize >= sizeof(size_t)) - ? ~(size_t)0 - : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; - ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); + unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; + size_t readMask = (readSize >= sizeof(size_t)) + ? 
~(size_t)0 + : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; + ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); // For the forwarded constant, use the read's type if it's integral, // otherwise use the store type. From fe29f90cca610d00f3667034a26f689ee7f5fc33 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 14:28:54 -0700 Subject: [PATCH 13/27] JIT: Skip store-load bypass for address-exposed locals Address-exposed locals can be modified through aliases between the store and load, making it unsafe to forward the constant. Skip the bypass for such locals. Fixes assertion failures in libraries_tests.run collection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index c8c5063510b4e2..9837e51da2ade2 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11308,7 +11308,8 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // last-use read from the same local. If found, we can bypass the store+load entirely // by replacing the load with the constant value and removing the store. if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && - !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler)) + !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) && + !m_compiler->lvaVarAddrExposed(store->GetLclNum())) { unsigned lclNum = store->GetLclNum(); unsigned offset = store->AsLclFld()->GetLclOffs(); From 345f8f707f760c57c8f135d58f5c1dac9bf1b6a8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 14:36:06 -0700 Subject: [PATCH 14/27] format, fix comments --- src/coreclr/jit/lower.cpp | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 9837e51da2ade2..7f8ea70fec51bc 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11304,9 +11304,9 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } while (true); - // After coalescing, if the store has a constant value, look ahead for a same-sized - // last-use read from the same local. If found, we can bypass the store+load entirely - // by replacing the load with the constant value and removing the store. + // After coalescing, if the store has a constant value, look ahead for an overlapped read + // from the same local. If found, we can bypass the store+load entirely + // by replacing the load with the constant value. if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) && !m_compiler->lvaVarAddrExposed(store->GetLclNum())) @@ -11319,14 +11319,8 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) GenTree* scanNode = store->gtNext; const int scanLimit = 20; - for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + for (int scanCount = 0; (scanNode != nullptr) && (scanCount < scanLimit); scanCount++) { - if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || - scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) - { - break; - } - // Found a read from the same local that may overlap our store. 
if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) @@ -11358,12 +11352,11 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } // Extract the portion of the constant that corresponds to the read. - ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); - unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; - size_t readMask = (readSize >= sizeof(size_t)) - ? ~(size_t)0 - : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; - ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); + unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; + size_t readMask = + (readSize >= sizeof(size_t)) ? ~(size_t)0 : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; + ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); // For the forwarded constant, use the read's type if it's integral, // otherwise use the store type. @@ -11372,8 +11365,8 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, - (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, - readOffset, readSize); + (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, readOffset, + readSize); // Create a new constant node and insert it in place of the load. // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd @@ -11392,8 +11385,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // Remove the load node. BlockRange().Remove(scanNode); - // The store may now be dead, but we leave it in place since - // LowerStoreLocCommon is still processing this node. + // The store may not be dead, so we leave it in place. break; } From ba118a0cb12f3c2286c88b8d4aa28ae8e37a8835 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 14:41:56 -0700 Subject: [PATCH 15/27] JIT: Extract store-forwarding into TryForwardConstantStoreLclFld helper Move the constant store-to-load forwarding logic out of LowerStoreLclFldCoalescing into its own method for clarity. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 176 ++++++++++++++++++++------------------ src/coreclr/jit/lower.h | 1 + 2 files changed, 96 insertions(+), 81 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 7f8ea70fec51bc..385f9392298d11 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11304,102 +11304,116 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } while (true); - // After coalescing, if the store has a constant value, look ahead for an overlapped read - // from the same local. If found, we can bypass the store+load entirely - // by replacing the load with the constant value. 
- if (store->OperIs(GT_STORE_LCL_FLD) && store->Data()->OperIsConst() && store->Data()->IsCnsIntOrI() && - !store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) && - !m_compiler->lvaVarAddrExposed(store->GetLclNum())) - { - unsigned lclNum = store->GetLclNum(); - unsigned offset = store->AsLclFld()->GetLclOffs(); - var_types storeType = store->TypeGet(); - unsigned storeSize = genTypeSize(storeType); - - GenTree* scanNode = store->gtNext; - const int scanLimit = 20; - - for (int scanCount = 0; (scanNode != nullptr) && (scanCount < scanLimit); scanCount++) - { - // Found a read from the same local that may overlap our store. - if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && - scanNode->AsLclVarCommon()->GetLclNum() == lclNum) - { - unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); - unsigned readSize = genTypeSize(scanNode->TypeGet()); - if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) - { - readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); - } + TryForwardConstantStoreLclFld(store); - // The read must be fully contained within the store's range. - if (readOffset < offset || (readOffset + readSize) > (offset + storeSize)) - { - break; // Read extends outside the store — can't forward. - } +#endif // TARGET_XARCH || TARGET_ARM64 +} - // Only forward to integral reads (not struct reads wider than a register). - if (!varTypeIsIntegral(scanNode->TypeGet()) && !scanNode->TypeIs(TYP_STRUCT)) - { - break; - } +//------------------------------------------------------------------------ +// TryForwardConstantStoreLclFld: After store coalescing/folding, if the resulting store +// writes a constant to a non-address-exposed local, look ahead for a load from the same +// local that is fully contained within the store's range. If found, replace the load with +// the appropriate portion of the constant value. +// +// Arguments: +// store - the GT_STORE_LCL_FLD node with a constant value +// +void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) +{ + if (!store->OperIs(GT_STORE_LCL_FLD) || !store->Data()->IsCnsIntOrI() || + store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) || + m_compiler->lvaVarAddrExposed(store->GetLclNum())) + { + return; + } - // Verify the stored value can reach the load — the store's data must be - // invariant between the store and the load (no intervening modifications). - if (!IsInvariantInRange(store->Data(), scanNode)) - { - break; - } + unsigned lclNum = store->GetLclNum(); + unsigned offset = store->AsLclFld()->GetLclOffs(); + var_types storeType = store->TypeGet(); + unsigned storeSize = genTypeSize(storeType); - // Extract the portion of the constant that corresponds to the read. - ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); - unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; - size_t readMask = - (readSize >= sizeof(size_t)) ? ~(size_t)0 : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; - ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); - - // For the forwarded constant, use the read's type if it's integral, - // otherwise use the store type. - var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? 
scanNode->TypeGet() : storeType; - - JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " - "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", - m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, - (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, readOffset, - readSize); - - // Create a new constant node and insert it in place of the load. - // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd - // uses it as-is instead of treating the low byte as a fill pattern. - GenTree* newConst = m_compiler->gtNewIconNode(forwardVal, fwdType); - newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; - BlockRange().InsertAfter(scanNode, newConst); - - // Replace all uses of the load with the new constant. - LIR::Use use; - if (BlockRange().TryGetUse(scanNode, &use)) - { - use.ReplaceWith(newConst); - } + GenTree* scanNode = store->gtNext; + const int scanLimit = 20; - // Remove the load node. - BlockRange().Remove(scanNode); + for (int scanCount = 0; (scanNode != nullptr) && (scanCount < scanLimit); scanCount++) + { + // Found a read from the same local that may overlap our store. + if (scanNode->OperIs(GT_LCL_VAR, GT_LCL_FLD) && scanNode->OperIsLocalRead() && + scanNode->AsLclVarCommon()->GetLclNum() == lclNum) + { + unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); + unsigned readSize = genTypeSize(scanNode->TypeGet()); + if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) + { + readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); + } - // The store may not be dead, so we leave it in place. + // The read must be fully contained within the store's range. + if (readOffset < offset || (readOffset + readSize) > (offset + storeSize)) + { break; } - // Stop at stores to the same local (would change the value). - if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) + // Only forward to integral reads (not struct reads wider than a register). + if (!varTypeIsIntegral(scanNode->TypeGet()) && !scanNode->TypeIs(TYP_STRUCT)) { break; } - scanNode = scanNode->gtNext; + // Verify the stored value can reach the load — the store's data must be + // invariant between the store and the load (no intervening modifications). + if (!IsInvariantInRange(store->Data(), scanNode)) + { + break; + } + + // Extract the portion of the constant that corresponds to the read. + ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); + unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; + size_t readMask = (readSize >= sizeof(size_t)) + ? ~(size_t)0 + : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; + ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + + // For the forwarded constant, use the read's type if it's integral, + // otherwise use the store type. + var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? scanNode->TypeGet() : storeType; + + JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " + "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", + m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, + (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, + readOffset, readSize); + + // Create a new constant node and insert it in place of the load. + // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd + // uses it as-is instead of treating the low byte as a fill pattern. 
+ GenTree* newConst = m_compiler->gtNewIconNode(forwardVal, fwdType); + newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; + BlockRange().InsertAfter(scanNode, newConst); + + // Replace all uses of the load with the new constant. + LIR::Use use; + if (BlockRange().TryGetUse(scanNode, &use)) + { + use.ReplaceWith(newConst); + } + + // Remove the load node. + BlockRange().Remove(scanNode); + + // The store may not be dead, so we leave it in place. + break; } - } -#endif // TARGET_XARCH || TARGET_ARM64 + // Stop at stores to the same local (would change the value). + if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == lclNum) + { + break; + } + + scanNode = scanNode->gtNext; + } } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index a70e9a3940167d..6bcbad7b43ca5c 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -390,6 +390,7 @@ class Lowering final : public Phase GenTree* LowerStoreIndir(GenTreeStoreInd* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); void LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store); + void TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); From 671ea9b110a003ffadad5d478ef16713c4ccffd2 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 14:56:01 -0700 Subject: [PATCH 16/27] JIT: Remove unnecessary IsInvariantInRange check from store forwarding The check is redundant: the forwarded value is always a constant (which is inherently invariant), address-exposed locals are already excluded, and the scan stops at any intervening store to the same local. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 385f9392298d11..4349e1c70c7515 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11360,13 +11360,6 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) break; } - // Verify the stored value can reach the load — the store's data must be - // invariant between the store and the load (no intervening modifications). - if (!IsInvariantInRange(store->Data(), scanNode)) - { - break; - } - // Extract the portion of the constant that corresponds to the read. ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; From 04b30010195c707666ad80b69cfc58eaccd39c32 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 15:02:04 -0700 Subject: [PATCH 17/27] JIT: Extract non-constant coalescing into TryCoalesceNonConstStoreLclFld Move the shift+OR composition logic for non-constant store coalescing into its own helper method for clarity. 
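As a sketch of what the composed data tree computes (the names lo and hi below are illustrative, not identifiers from the change): with lo being the int stored at the lower offset and hi the int stored at the higher offset, the zero-extend CAST + LSH + OR sequence is equivalent to:

    #include <cstdint>

    // Zero-extend both 32-bit halves, shift the high half into the upper
    // 32 bits, and OR them together; the result feeds the single long store.
    uint64_t ComposeQword(uint32_t lo, uint32_t hi)
    {
        return (uint64_t)lo | ((uint64_t)hi << 32);
    }

For instance, ComposeQword(0x11, 0x22) produces 0x0000002200000011.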
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 242 ++++++++++++++++++++++---------------- src/coreclr/jit/lower.h | 4 + 2 files changed, 142 insertions(+), 104 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 4349e1c70c7515..ae0d1e1a7dee51 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11169,144 +11169,178 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) continue; } -#ifdef TARGET_64BIT - // Non-constant values: compose two integral values into a wider value via - // cast + shift + OR. For now, restrict to int+int -> long which is the dominant - // case and avoids complications with small type register widening. - if (currType != TYP_INT || newType != TYP_LONG) + if (TryCoalesceNonConstStoreLclFld(store, prevStore, currType, newType)) { return; } + return; - // Non-constant coalescing increases code size (adds cast+shift+or instructions). - // Only do it if we can see that the local is read back at a wider width within - // the next few instructions, which would cause a store-forwarding stall. - { - unsigned newOffset = min(prevOffset, currOffset); - unsigned combinedSize = genTypeSize(currType) * 2; - bool foundWiderRead = false; - GenTree* scanNode = store->gtNext; - const int scanLimit = 20; + } while (true); + + TryForwardConstantStoreLclFld(store); + +#endif // TARGET_XARCH || TARGET_ARM64 +} + +//------------------------------------------------------------------------ +// TryCoalesceNonConstStoreLclFld: Try to coalesce two adjacent non-constant +// GT_STORE_LCL_FLD stores into a single wider store by composing the two +// register values via cast + shift + OR. Only handles int+int -> long. +// +// This is only profitable when a wider read follows, which would otherwise +// cause a store-forwarding stall. +// +// Arguments: +// store - the current GT_STORE_LCL_FLD node +// prevStore - the previous GT_STORE_LCL_FLD node (same local, adjacent offset) +// currType - the type of both stores (must be TYP_INT) +// newType - the wider type to coalesce into (must be TYP_LONG) +// +// Return Value: +// true if the coalescing was performed, false otherwise. +// +bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, + GenTreeLclFld* prevStore, + var_types currType, + var_types newType) +{ +#ifdef TARGET_64BIT + // For now, restrict to int+int -> long which is the dominant case + // and avoids complications with small type register widening. + if (currType != TYP_INT || newType != TYP_LONG) + { + return false; + } - for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + unsigned currLclNum = store->GetLclNum(); + unsigned currOffset = store->AsLclFld()->GetLclOffs(); + unsigned prevOffset = prevStore->GetLclOffs(); + + // Non-constant coalescing increases code size (adds cast+shift+or instructions). + // Only do it if we can see that the local is read back at a wider width within + // the next few instructions, which would cause a store-forwarding stall. + { + unsigned newOffset = min(prevOffset, currOffset); + unsigned combinedSize = genTypeSize(currType) * 2; + bool foundWiderRead = false; + GenTree* scanNode = store->gtNext; + const int scanLimit = 20; + + for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) + { + // Stop scanning at control flow boundaries. 
+ if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || + scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) { - // Stop scanning at control flow boundaries. - if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || - scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) - { - break; - } + break; + } - // Check for a local read from the same variable that overlaps our store range - // at a wider width. - if (scanNode->OperIsLocalRead() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) + // Check for a local read from the same variable that overlaps our store range + // at a wider width. + if (scanNode->OperIsLocalRead() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) + { + unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); + unsigned readSize = genTypeSize(scanNode->TypeGet()); + if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) { - unsigned readOffset = scanNode->AsLclVarCommon()->GetLclOffs(); - unsigned readSize = genTypeSize(scanNode->TypeGet()); - if (readSize == 0 && scanNode->TypeIs(TYP_STRUCT)) - { - readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); - } - - // Does this read overlap our combined store range and is it wider? - if (readSize > genTypeSize(currType) && readOffset <= newOffset && - (readOffset + readSize) >= (newOffset + combinedSize)) - { - foundWiderRead = true; - break; - } + readSize = scanNode->AsLclVarCommon()->GetLayout(m_compiler)->GetSize(); } - // Stop at stores to the same local (would invalidate the forwarding scenario). - if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) + // Does this read overlap our combined store range and is it wider? + if (readSize > genTypeSize(currType) && readOffset <= newOffset && + (readOffset + readSize) >= (newOffset + combinedSize)) { + foundWiderRead = true; break; } - - scanNode = scanNode->gtNext; } - if (!foundWiderRead) + // Stop at stores to the same local (would invalidate the forwarding scenario). + if (scanNode->OperIsLocalStore() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) { - JITDUMP("Skipping non-const GT_STORE_LCL_FLD coalescing: no wider read found within %d nodes\n", - scanLimit); - return; + break; } + + scanNode = scanNode->gtNext; } - // Both values must have closed tree ranges so we can safely relocate them. - bool isCurrValueClosed = false; - bool isPrevValueClosed = false; - BlockRange().GetTreeRange(currValue, &isCurrValueClosed); - BlockRange().GetTreeRange(prevValue, &isPrevValueClosed); - if (!isCurrValueClosed || !isPrevValueClosed) + if (!foundWiderRead) { - return; + JITDUMP("Skipping non-const GT_STORE_LCL_FLD coalescing: no wider read found within %d nodes\n", + scanLimit); + return false; } + } - JITDUMP("Coalescing two non-const GT_STORE_LCL_FLD stores via shift+OR:\n"); - JITDUMP(" Previous store: V%02u [+%u] %s\n", currLclNum, prevOffset, varTypeName(prevType)); - JITDUMP(" Current store: V%02u [+%u] %s\n", currLclNum, currOffset, varTypeName(currType)); - JITDUMP(" New type: %s\n", varTypeName(newType)); - - // Identify which value goes in the low bits and which in the high bits. - GenTree* lowValue = (prevOffset < currOffset) ? prevValue : currValue; - GenTree* highValue = (prevOffset < currOffset) ? 
currValue : prevValue; + GenTree* currValue = store->Data(); + GenTree* prevValue = prevStore->Data(); - // Clear containment flags — the values may have been marked contained for - // the original stores but need to be uncontained for the new CAST/OR tree. - lowValue->ClearContained(); - highValue->ClearContained(); + // Both values must have closed tree ranges so we can safely relocate them. + bool isCurrValueClosed = false; + bool isPrevValueClosed = false; + BlockRange().GetTreeRange(currValue, &isCurrValueClosed); + BlockRange().GetTreeRange(prevValue, &isPrevValueClosed); + if (!isCurrValueClosed || !isPrevValueClosed) + { + return false; + } - // Remove the previous store node, keeping its data subtree in the LIR. - prevStore->Data() = nullptr; - BlockRange().Remove(prevStore); + var_types prevType = prevStore->TypeGet(); + JITDUMP("Coalescing two non-const GT_STORE_LCL_FLD stores via shift+OR:\n"); + JITDUMP(" Previous store: V%02u [+%u] %s\n", currLclNum, prevOffset, varTypeName(prevType)); + JITDUMP(" Current store: V%02u [+%u] %s\n", currLclNum, currOffset, varTypeName(currType)); + JITDUMP(" New type: %s\n", varTypeName(newType)); - // Zero-extend both values to the new wider type (int -> long). - unsigned shiftBits = genTypeSize(currType) * BITS_PER_BYTE; + // Identify which value goes in the low bits and which in the high bits. + GenTree* lowValue = (prevOffset < currOffset) ? prevValue : currValue; + GenTree* highValue = (prevOffset < currOffset) ? currValue : prevValue; - if (genTypeSize(newType) > genTypeSize(genActualType(lowValue))) - { - GenTree* castLow = m_compiler->gtNewCastNode(newType, lowValue, true, newType); - BlockRange().InsertBefore(store, castLow); - lowValue = castLow; - } + // Clear containment flags — the values may have been marked contained for + // the original stores but need to be uncontained for the new CAST/OR tree. + lowValue->ClearContained(); + highValue->ClearContained(); - if (genTypeSize(newType) > genTypeSize(genActualType(highValue))) - { - GenTree* castHigh = m_compiler->gtNewCastNode(newType, highValue, true, newType); - BlockRange().InsertBefore(store, castHigh); - highValue = castHigh; - } + // Remove the previous store node, keeping its data subtree in the LIR. + prevStore->Data() = nullptr; + BlockRange().Remove(prevStore); - // Shift the high value left. - GenTree* shiftAmount = m_compiler->gtNewIconNode((ssize_t)shiftBits); - GenTree* shifted = m_compiler->gtNewOperNode(GT_LSH, newType, highValue, shiftAmount); - BlockRange().InsertBefore(store, shiftAmount, shifted); + // Zero-extend both values to the new wider type (int -> long). + unsigned shiftBits = genTypeSize(currType) * BITS_PER_BYTE; - // OR the low and shifted-high values. - GenTree* combined = m_compiler->gtNewOperNode(GT_OR, newType, lowValue, shifted); - BlockRange().InsertBefore(store, combined); + if (genTypeSize(newType) > genTypeSize(genActualType(lowValue))) + { + GenTree* castLow = m_compiler->gtNewCastNode(newType, lowValue, true, newType); + BlockRange().InsertBefore(store, castLow); + lowValue = castLow; + } - // Update the current store to use the combined value at the lower offset. 
- unsigned newOffset = min(prevOffset, currOffset); - store->AsLclFld()->SetLclOffs(newOffset); - store->gtType = newType; - store->Data() = combined; + if (genTypeSize(newType) > genTypeSize(genActualType(highValue))) + { + GenTree* castHigh = m_compiler->gtNewCastNode(newType, highValue, true, newType); + BlockRange().InsertBefore(store, castHigh); + highValue = castHigh; + } - JITDUMP("Coalesced two non-const stores into a single store via shift+OR\n"); - // Don't loop for non-constant coalescing — the combined value is no longer a simple - // constant, so further coalescing would need a different approach. - return; -#else - return; -#endif // TARGET_64BIT + // Shift the high value left. + GenTree* shiftAmount = m_compiler->gtNewIconNode((ssize_t)shiftBits); + GenTree* shifted = m_compiler->gtNewOperNode(GT_LSH, newType, highValue, shiftAmount); + BlockRange().InsertBefore(store, shiftAmount, shifted); - } while (true); + // OR the low and shifted-high values. + GenTree* combined = m_compiler->gtNewOperNode(GT_OR, newType, lowValue, shifted); + BlockRange().InsertBefore(store, combined); - TryForwardConstantStoreLclFld(store); + // Update the current store to use the combined value at the lower offset. + unsigned newOffset = min(prevOffset, currOffset); + store->AsLclFld()->SetLclOffs(newOffset); + store->gtType = newType; + store->Data() = combined; -#endif // TARGET_XARCH || TARGET_ARM64 + JITDUMP("Coalesced two non-const stores into a single store via shift+OR\n"); + return true; +#else + return false; +#endif // TARGET_64BIT } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 6bcbad7b43ca5c..fb17e64555b949 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -391,6 +391,10 @@ class Lowering final : public Phase void LowerStoreIndirCoalescing(GenTreeIndir* node); void LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store); void TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store); + bool TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, + GenTreeLclFld* prevStore, + var_types currType, + var_types newType); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); From f160a84856dea4b9cba23201db41ac816ec4b236 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 15:11:20 -0700 Subject: [PATCH 18/27] JIT: Fix formatting via jit-format Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index ae0d1e1a7dee51..2f6fd548005dcf 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11266,8 +11266,7 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, if (!foundWiderRead) { - JITDUMP("Skipping non-const GT_STORE_LCL_FLD coalescing: no wider read found within %d nodes\n", - scanLimit); + JITDUMP("Skipping non-const GT_STORE_LCL_FLD coalescing: no wider read found within %d nodes\n", scanLimit); return false; } } @@ -11355,8 +11354,7 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) { if (!store->OperIs(GT_STORE_LCL_FLD) || !store->Data()->IsCnsIntOrI() || - store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) || - 
m_compiler->lvaVarAddrExposed(store->GetLclNum())) + store->Data()->AsIntCon()->ImmedValNeedsReloc(m_compiler) || m_compiler->lvaVarAddrExposed(store->GetLclNum())) { return; } @@ -11395,12 +11393,10 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) } // Extract the portion of the constant that corresponds to the read. - ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); - unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; - size_t readMask = (readSize >= sizeof(size_t)) - ? ~(size_t)0 - : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; - ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + ssize_t fullVal = store->Data()->AsIntCon()->IconValue(); + unsigned bitOffset = (readOffset - offset) * BITS_PER_BYTE; + size_t readMask = (readSize >= sizeof(size_t)) ? ~(size_t)0 : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; + ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); // For the forwarded constant, use the read's type if it's integral, // otherwise use the store type. @@ -11409,8 +11405,7 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, - (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, - readOffset, readSize); + (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, readOffset, readSize); // Create a new constant node and insert it in place of the load. // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd From f8261e8aa4be11baa9edf44dbce174717e457009 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 15:21:16 -0700 Subject: [PATCH 19/27] JIT: Remove redundant control flow boundary checks from scan loops BlockRange() is per-block, so gtNext naturally returns null at block boundaries. No need to explicitly check for conditional jumps, returns, or other terminators. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2f6fd548005dcf..74496affa76651 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11228,13 +11228,6 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, for (int scanCount = 0; scanNode != nullptr && scanCount < scanLimit; scanCount++) { - // Stop scanning at control flow boundaries. - if (scanNode->OperIsConditionalJump() || scanNode->OperGet() == GT_JMP || - scanNode->OperGet() == GT_RETURN || scanNode->OperGet() == GT_SWIFT_ERROR_RET) - { - break; - } - // Check for a local read from the same variable that overlaps our store range // at a wider width. if (scanNode->OperIsLocalRead() && scanNode->AsLclVarCommon()->GetLclNum() == currLclNum) From 2a1eade8bffa4e8cb6fb74f8d27927c9cb896e22 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 16:05:31 -0700 Subject: [PATCH 20/27] JIT: Fix store forwarding for small types and struct reads Use genActualType for the forwarded constant type to ensure CNS_INT nodes use register-sized types (TYP_INT) instead of small types like TYP_USHORT which are not valid for standalone constants. Also restrict forwarding to integral-typed reads only. 
Struct-typed reads feed STORE_BLK nodes that expect struct operands with specific layouts and cannot accept a bare CNS_INT replacement. Fixes coreclr_tests failures. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 74496affa76651..16ef84e7b9238e 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11379,8 +11379,9 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) break; } - // Only forward to integral reads (not struct reads wider than a register). - if (!varTypeIsIntegral(scanNode->TypeGet()) && !scanNode->TypeIs(TYP_STRUCT)) + // Only forward to integral-typed reads. Struct-typed reads feed into + // STORE_BLK nodes that expect struct operands with specific layouts. + if (!varTypeIsIntegral(scanNode->TypeGet())) { break; } @@ -11391,9 +11392,8 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) size_t readMask = (readSize >= sizeof(size_t)) ? ~(size_t)0 : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); - // For the forwarded constant, use the read's type if it's integral, - // otherwise use the store type. - var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? scanNode->TypeGet() : storeType; + // For the forwarded constant, use the actual register type. + var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? genActualType(scanNode->TypeGet()) : storeType; JITDUMP("Forwarding constant store [%06u] to load [%06u] for V%02u[+%u]: " "store value 0x%llx, forwarded value 0x%llx (read offset +%u, size %u)\n", From d79c292c9cddd5e3404b0b58793de0725e8c5f6c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 26 Mar 2026 18:59:41 -0700 Subject: [PATCH 21/27] JIT: Address PR review feedback - Change early returns in LowerStoreLclFldCoalescing to break so that TryForwardConstantStoreLclFld is reached even when coalescing is not applicable (different local, non-adjacent offsets, etc.). - Fix TryTransformStoreObjAsStoreInd to update blkNode->Data() after unwrapping GT_INIT_VAL, preventing a stale reference. - Move constant node creation inside TryGetUse success block in TryForwardConstantStoreLclFld to avoid inserting unreferenced nodes into LIR when the load has no user. - Use genActualType for forwarded constant type to avoid small types. - Allow struct-typed reads in forwarding when the struct fits in a register, but exclude reads feeding STORE_BLK or promoted stores. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 58 ++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 16ef84e7b9238e..f119628a9227e3 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10955,7 +10955,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) // Must be the same local variable. if (prevStore->GetLclNum() != currLclNum) { - return; + break; } // Both stores must be the same type. @@ -11021,14 +11021,14 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) } } - return; + break; } // The offsets must be adjacent (differ by exactly the type size). 
unsigned prevOffset = prevStore->GetLclOffs(); if (abs((int)prevOffset - (int)currOffset) != (int)genTypeSize(currType)) { - return; + break; } GenTree* currValue = store->Data(); @@ -11039,7 +11039,7 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) LIR::ReadOnlyRange prevRange = BlockRange().GetTreeRange(prevStore, &isPrevClosedRange); if (!isPrevClosedRange) { - return; + break; } bool isCurrConst = currValue->OperIsConst() && @@ -11379,11 +11379,34 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) break; } - // Only forward to integral-typed reads. Struct-typed reads feed into - // STORE_BLK nodes that expect struct operands with specific layouts. + // Only forward to integral-typed reads, or struct reads that fit in a register. + // Struct reads that feed promoted STORE_LCL_FLD(P) nodes cannot accept a bare + // CNS_INT because the promoted store decomposes into per-field stores. if (!varTypeIsIntegral(scanNode->TypeGet())) { - break; + if (!scanNode->TypeIs(TYP_STRUCT)) + { + break; + } + ClassLayout* layout = scanNode->AsLclVarCommon()->GetLayout(m_compiler); + if (layout == nullptr || layout->GetRegisterType() == TYP_UNDEF) + { + break; + } + // Check if the load's user is a promoted store or block copy — can't forward. + LIR::Use checkUse; + if (BlockRange().TryGetUse(scanNode, &checkUse)) + { + GenTree* user = checkUse.User(); + if (user->OperIs(GT_STORE_BLK)) + { + break; + } + if (user->OperIsLocalStore() && m_compiler->lvaGetDesc(user->AsLclVarCommon())->lvPromoted) + { + break; + } + } } // Extract the portion of the constant that corresponds to the read. @@ -11400,24 +11423,20 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) m_compiler->dspTreeID(store), m_compiler->dspTreeID(scanNode), lclNum, readOffset, (unsigned long long)(size_t)fullVal, (unsigned long long)(size_t)forwardVal, readOffset, readSize); - // Create a new constant node and insert it in place of the load. - // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd - // uses it as-is instead of treating the low byte as a fill pattern. - GenTree* newConst = m_compiler->gtNewIconNode(forwardVal, fwdType); - newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; - BlockRange().InsertAfter(scanNode, newConst); - - // Replace all uses of the load with the new constant. + // Replace the load with a new constant, if the load has a user. LIR::Use use; if (BlockRange().TryGetUse(scanNode, &use)) { + // Create a new constant node and insert it in place of the load. + // Mark it with GTF_ICON_STRUCT_INIT_VAL so that TryTransformStoreObjAsStoreInd + // uses it as-is instead of treating the low byte as a fill pattern. + GenTree* newConst = m_compiler->gtNewIconNode(forwardVal, fwdType); + newConst->gtFlags |= GTF_ICON_STRUCT_INIT_VAL; + BlockRange().InsertAfter(scanNode, newConst); use.ReplaceWith(newConst); + BlockRange().Remove(scanNode); } - // Remove the load node. - BlockRange().Remove(scanNode); - - // The store may not be dead, so we leave it in place. 
break; } @@ -12407,6 +12426,7 @@ bool Lowering::TryTransformStoreObjAsStoreInd(GenTreeBlk* blkNode) { BlockRange().Remove(src); src = src->gtGetOp1(); + blkNode->SetData(src); } // If the constant was produced by store-load bypass (GTF_ICON_STRUCT_INIT_VAL), From 3945afb2d1e018ffad0d2658ccc07db593aed5e1 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 08:45:05 -0700 Subject: [PATCH 22/27] JIT: Change GTF_ICON_STRUCT_INIT_VAL bit to avoid conflict Change GTF_ICON_STRUCT_INIT_VAL from 0x00100000 to 0x00400000 to avoid sharing a bit with GTF_IND_ALLOW_NON_ATOMIC. While these flags are on different node types (GT_CNS_INT vs GT_IND), flag propagation from data nodes to parent STOREIND nodes could cause the STOREIND coalescing to misinterpret a constant's struct-init flag as an atomicity hint, leading to incorrect store merging on x86. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/gentree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 69592d8351b6bc..96bba9e2208d0b 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -519,7 +519,7 @@ enum GenTreeFlags : unsigned // GTF_ICON_REUSE_REG_VAL = 0x00800000 // GT_CNS_INT -- GTF_REUSE_REG_VAL, defined above GTF_ICON_SIMD_COUNT = 0x00200000, // GT_CNS_INT -- constant is Vector.Count - GTF_ICON_STRUCT_INIT_VAL = 0x00100000, // GT_CNS_INT -- constant is a full struct init value, not a byte pattern + GTF_ICON_STRUCT_INIT_VAL = 0x00400000, // GT_CNS_INT -- constant is a full struct init value, not a byte pattern GTF_OVERFLOW = 0x10000000, // Supported for: GT_ADD, GT_SUB, GT_MUL and GT_CAST. // Requires an overflow check. Use gtOverflow(Ex)() to check this flag. From 786addd44d2190435f07af76d9a2af5312a529d0 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 09:11:21 -0700 Subject: [PATCH 23/27] JIT: Restrict store forwarding to integral-typed reads only Struct-typed reads have ABI and layout requirements (e.g., stack argument passing on x86) that a bare CNS_INT replacement cannot satisfy. Forwarding a 4-byte CNS_INT in place of a 1-byte struct argument corrupts the calling convention on x86. Revert to the strict integral-only check, removing the struct register-type and STORE_BLK/promoted-store special cases. Fixes Runtime_73628 test failure on x86. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index f119628a9227e3..d0352e8e0a0e56 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11379,34 +11379,11 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) break; } - // Only forward to integral-typed reads, or struct reads that fit in a register. - // Struct reads that feed promoted STORE_LCL_FLD(P) nodes cannot accept a bare - // CNS_INT because the promoted store decomposes into per-field stores. + // Only forward to integral-typed reads. Struct-typed reads have specific + // ABI and layout requirements that a bare CNS_INT cannot satisfy. 
if (!varTypeIsIntegral(scanNode->TypeGet())) { - if (!scanNode->TypeIs(TYP_STRUCT)) - { - break; - } - ClassLayout* layout = scanNode->AsLclVarCommon()->GetLayout(m_compiler); - if (layout == nullptr || layout->GetRegisterType() == TYP_UNDEF) - { - break; - } - // Check if the load's user is a promoted store or block copy — can't forward. - LIR::Use checkUse; - if (BlockRange().TryGetUse(scanNode, &checkUse)) - { - GenTree* user = checkUse.User(); - if (user->OperIs(GT_STORE_BLK)) - { - break; - } - if (user->OperIsLocalStore() && m_compiler->lvaGetDesc(user->AsLclVarCommon())->lvPromoted) - { - break; - } - } + break; } // Extract the portion of the constant that corresponds to the read. From 283385a4ce87bf18fef89eeb63229ff3ec4290aa Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 09:52:16 -0700 Subject: [PATCH 24/27] JIT: ABI-aware struct forwarding in TryForwardConstantStoreLclFld On 64-bit targets, allow forwarding through struct-typed reads when: - The struct layout fits in a single register (GetRegisterType != UNDEF) - The struct is >= 4 bytes (smaller structs have STORE_BLK complications) - The read does not feed a PUTARG_STK (calling convention mismatch) On 32-bit targets, struct reads are never forwarded since struct vs integer arguments have different calling conventions on x86. This restores the Priority:.cctor optimization (direct constant stores to static fields without stack round-trip) that was lost by the previous integral-only restriction, while keeping x86 safe. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index d0352e8e0a0e56..285a63b004b3d7 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11380,10 +11380,38 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) } // Only forward to integral-typed reads. Struct-typed reads have specific - // ABI and layout requirements that a bare CNS_INT cannot satisfy. + // ABI and layout requirements that a bare CNS_INT cannot always satisfy. + // On 64-bit, allow struct reads whose layout fits in a register, unless + // the read feeds a PUTARG_STK (where struct vs int ABI may differ). if (!varTypeIsIntegral(scanNode->TypeGet())) { - break; +#ifdef TARGET_64BIT + if (scanNode->TypeIs(TYP_STRUCT)) + { + ClassLayout* layout = scanNode->AsLclVarCommon()->GetLayout(m_compiler); + if (layout == nullptr || layout->GetRegisterType() == TYP_UNDEF) + { + break; + } + // Only allow struct forwarding for structs >= 4 bytes. Smaller structs + // have ABI complications (e.g., promoted STORE_BLK decomposition). + if (layout->GetSize() < 4) + { + break; + } + // PUTARG_STK: struct vs int may differ in calling convention. + LIR::Use checkUse; + if (BlockRange().TryGetUse(scanNode, &checkUse) && + checkUse.User()->OperIs(GT_PUTARG_STK)) + { + break; + } + } + else +#endif + { + break; + } } // Extract the portion of the constant that corresponds to the read. From fcf8bef51e3ef788a1a96124a40440129a4735fd Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 10:49:52 -0700 Subject: [PATCH 25/27] JIT: Enable struct forwarding on all platforms Remove the TARGET_64BIT guard from struct read forwarding. The ABI safety is handled by the PUTARG_STK and size checks, which apply equally on x86 and x64. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 47 +++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 26 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 285a63b004b3d7..560e0c5b54cdf8 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11379,36 +11379,31 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) break; } - // Only forward to integral-typed reads. Struct-typed reads have specific - // ABI and layout requirements that a bare CNS_INT cannot always satisfy. - // On 64-bit, allow struct reads whose layout fits in a register, unless - // the read feeds a PUTARG_STK (where struct vs int ABI may differ). + // Only forward to integral-typed reads. For struct-typed reads, allow + // forwarding when the struct fits in a register and the read doesn't + // feed a PUTARG_STK (where struct vs int ABI may differ). if (!varTypeIsIntegral(scanNode->TypeGet())) { -#ifdef TARGET_64BIT - if (scanNode->TypeIs(TYP_STRUCT)) + if (!scanNode->TypeIs(TYP_STRUCT)) { - ClassLayout* layout = scanNode->AsLclVarCommon()->GetLayout(m_compiler); - if (layout == nullptr || layout->GetRegisterType() == TYP_UNDEF) - { - break; - } - // Only allow struct forwarding for structs >= 4 bytes. Smaller structs - // have ABI complications (e.g., promoted STORE_BLK decomposition). - if (layout->GetSize() < 4) - { - break; - } - // PUTARG_STK: struct vs int may differ in calling convention. - LIR::Use checkUse; - if (BlockRange().TryGetUse(scanNode, &checkUse) && - checkUse.User()->OperIs(GT_PUTARG_STK)) - { - break; - } + break; } - else -#endif + ClassLayout* layout = scanNode->AsLclVarCommon()->GetLayout(m_compiler); + if (layout == nullptr || layout->GetRegisterType() == TYP_UNDEF) + { + break; + } + // Small structs (< 4 bytes) have complications with STORE_BLK + // decomposition into promoted field stores. + if (layout->GetSize() < 4) + { + break; + } + // Don't forward if the struct read feeds a PUTARG_STK — struct vs int + // may have different calling conventions (especially on x86). + LIR::Use checkUse; + if (BlockRange().TryGetUse(scanNode, &checkUse) && + checkUse.User()->OperIs(GT_PUTARG_STK)) { break; } From 7efe2d4e21a8dd98207302e210258064e57cb265 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 16:25:26 -0700 Subject: [PATCH 26/27] JIT: Address second round of PR review feedback - Skip constant coalescing for address-exposed locals to preserve atomicity guarantees (matching LowerStoreIndirCoalescing behavior). - Call LowerNode on inserted CAST/LSH/OR nodes in non-constant coalescing so containment and target-specific lowering is applied. - Sign-extend extracted values for signed small types (TYP_BYTE, TYP_SHORT) in TryForwardConstantStoreLclFld to match GT_LCL_FLD load semantics. - Check for side effects (GTF_SIDE_EFFECT | GTF_ORDER_SIDEEFF) in both data trees before non-constant coalescing to ensure reordering the stores is safe. 
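Regarding the sign-extension bullet above, a standalone sketch of the intended arithmetic (the helper name and signature are hypothetical, not part of the change):

    #include <cstdint>

    // Pull readSize bytes out of the stored constant starting at bitOffset,
    // then replicate the sign bit when the read is a signed small type,
    // so the forwarded constant matches what a signed GT_LCL_FLD load yields.
    int64_t ExtractForwardedValue(uint64_t fullVal, unsigned bitOffset, unsigned readSize, bool isSigned)
    {
        uint64_t mask = (readSize >= 8) ? ~0ull : ((1ull << (readSize * 8)) - 1);
        uint64_t val  = (fullVal >> bitOffset) & mask;
        if (isSigned && (readSize < 8) && (((val >> (readSize * 8 - 1)) & 1) != 0))
        {
            val |= ~mask; // fill the upper bits with the sign bit
        }
        return (int64_t)val;
    }

A one-byte signed read of 0xF0, for example, comes back as -16 rather than 240.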
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 560e0c5b54cdf8..8da383319b2aa8 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10930,6 +10930,14 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) return; } + // For address-exposed locals, the wider merged store may not be as atomic + // as the individual narrower stores. Skip coalescing to match the + // atomicity guarantees of LowerStoreIndirCoalescing. + if (m_compiler->lvaVarAddrExposed(currLclNum)) + { + break; + } + // Make sure the current node's tree range is closed (no unexpected interleaved nodes). bool isClosedRange = false; LIR::ReadOnlyRange currRange = BlockRange().GetTreeRange(store, &isClosedRange); @@ -11277,6 +11285,17 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, return false; } + // Removing the previous store moves it past the current store's data evaluation. + // This is unsafe if the data tree has side effects or references the same local. + if ((currValue->gtFlags & (GTF_SIDE_EFFECT | GTF_ORDER_SIDEEFF)) != 0) + { + return false; + } + if ((prevValue->gtFlags & (GTF_SIDE_EFFECT | GTF_ORDER_SIDEEFF)) != 0) + { + return false; + } + var_types prevType = prevStore->TypeGet(); JITDUMP("Coalescing two non-const GT_STORE_LCL_FLD stores via shift+OR:\n"); JITDUMP(" Previous store: V%02u [+%u] %s\n", currLclNum, prevOffset, varTypeName(prevType)); @@ -11303,6 +11322,7 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, { GenTree* castLow = m_compiler->gtNewCastNode(newType, lowValue, true, newType); BlockRange().InsertBefore(store, castLow); + LowerNode(castLow); lowValue = castLow; } @@ -11310,6 +11330,7 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, { GenTree* castHigh = m_compiler->gtNewCastNode(newType, highValue, true, newType); BlockRange().InsertBefore(store, castHigh); + LowerNode(castHigh); highValue = castHigh; } @@ -11317,10 +11338,12 @@ bool Lowering::TryCoalesceNonConstStoreLclFld(GenTreeLclVarCommon* store, GenTree* shiftAmount = m_compiler->gtNewIconNode((ssize_t)shiftBits); GenTree* shifted = m_compiler->gtNewOperNode(GT_LSH, newType, highValue, shiftAmount); BlockRange().InsertBefore(store, shiftAmount, shifted); + LowerNode(shifted); // OR the low and shifted-high values. GenTree* combined = m_compiler->gtNewOperNode(GT_OR, newType, lowValue, shifted); BlockRange().InsertBefore(store, combined); + LowerNode(combined); // Update the current store to use the combined value at the lower offset. unsigned newOffset = min(prevOffset, currOffset); @@ -11415,6 +11438,20 @@ void Lowering::TryForwardConstantStoreLclFld(GenTreeLclVarCommon* store) size_t readMask = (readSize >= sizeof(size_t)) ? ~(size_t)0 : ((size_t)1 << (readSize * BITS_PER_BYTE)) - 1; ssize_t forwardVal = (ssize_t)(((size_t)fullVal >> bitOffset) & readMask); + // For signed small types, sign-extend the extracted value to match + // the GT_LCL_FLD load semantics. 
+ var_types loadType = scanNode->TypeGet(); + if (varTypeIsIntegral(loadType) && varTypeIsSmall(loadType) && !varTypeIsUnsigned(loadType) && + readSize < sizeof(ssize_t)) + { + unsigned loadBits = readSize * BITS_PER_BYTE; + ssize_t signBit = (ssize_t)1 << (loadBits - 1); + if ((forwardVal & signBit) != 0) + { + forwardVal |= ~(((ssize_t)1 << loadBits) - 1); + } + } + // For the forwarded constant, use the actual register type. var_types fwdType = varTypeIsIntegral(scanNode->TypeGet()) ? genActualType(scanNode->TypeGet()) : storeType; From b698b839b91477b61bc3097db9d6aac22013dcc0 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 27 Mar 2026 17:42:55 -0700 Subject: [PATCH 27/27] JIT: Remove overly broad address-exposed coalescing restriction The blanket lvaVarAddrExposed check was preventing constant store coalescing for address-exposed locals, causing GUID initialization and other constant struct patterns to regress from a single xmm load to many individual byte/word stores. Stack locals are always naturally aligned, so the merged wider store is safe even for address-exposed locals. The atomicity concern (torn writes visible to racy observers) does not apply to constant coalescing where we are just combining known constant values. Removes 280 method regressions on aspnet2. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 8da383319b2aa8..fbeae2ad49247a 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10930,14 +10930,6 @@ void Lowering::LowerStoreLclFldCoalescing(GenTreeLclVarCommon* store) return; } - // For address-exposed locals, the wider merged store may not be as atomic - // as the individual narrower stores. Skip coalescing to match the - // atomicity guarantees of LowerStoreIndirCoalescing. - if (m_compiler->lvaVarAddrExposed(currLclNum)) - { - break; - } - // Make sure the current node's tree range is closed (no unexpected interleaved nodes). bool isClosedRange = false; LIR::ReadOnlyRange currRange = BlockRange().GetTreeRange(store, &isClosedRange);