From 2caa126ab1e35639cb90e989b6d3edb38269670c Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sun, 5 Apr 2026 08:43:37 -0700 Subject: [PATCH 1/9] JIT: STORE_LCL_FLD coalescing Refactor the lowering-time store coalescing machinery so adjacent constant GT_STORE_LCL_FLD writes can reuse the existing combine logic that was previously limited to STOREIND/STORE_BLK. Gate the new local-field path from LowerStoreLocCommon with JitEnableStoreLclFldCoalescing. Extend the struct-promotion regression coverage for the newly handled cases. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/jitconfigvalues.h | 1 + src/coreclr/jit/lower.cpp | 621 +++++++++++------- src/coreclr/jit/lower.h | 57 +- .../JIT/Directed/StructPromote/SpAddr.cs | 55 +- 4 files changed, 496 insertions(+), 238 deletions(-) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 6890ee573919d7..5318304d9206de 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -91,6 +91,7 @@ CONFIG_INTEGER(JitHideAlignBehindJmp, "JitHideAlignBehindJmp", 1) // Track stores to locals done through return buffers. 
CONFIG_INTEGER(JitOptimizeStructHiddenBuffer, "JitOptimizeStructHiddenBuffer", 1) +RELEASE_CONFIG_INTEGER(JitEnableStoreLclFldCoalescing, "JitEnableStoreLclFldCoalescing", 1) CONFIG_INTEGER(JitUnrollLoopMaxIterationCount, "JitUnrollLoopMaxIterationCount", diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 6ebc78c1433591..06e5b3fe8b4689 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5662,6 +5662,11 @@ GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore) TryRetypingFloatingPointStoreToIntegerStore(lclStore); + if (lclStore->OperIs(GT_STORE_LCL_FLD) && (JitConfig.JitEnableStoreLclFldCoalescing() != 0)) + { + LowerStoreCoalescing(lclStore); + } + GenTree* src = lclStore->gtGetOp1(); LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); const bool srcIsMultiReg = src->IsMultiRegNode(); @@ -10436,6 +10441,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing } data->targetType = ind->TypeGet(); + data->accessSize = ind->Size(); data->value = isStore ? ind->Data() : nullptr; if (ind->Addr()->OperIs(GT_LEA)) { @@ -10458,6 +10464,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = index == nullptr ? 
nullptr : index; data->scale = ind->Addr()->AsAddrMode()->GetScale(); data->offset = ind->Addr()->AsAddrMode()->Offset(); + if ((base != nullptr) && base->OperIs(GT_LCL_VAR)) + { + data->lclNum = base->AsLclVar()->GetLclNum(); + } } else if (isNodeInvariant(m_compiler, ind->Addr(), true)) { @@ -10466,6 +10476,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = nullptr; data->scale = 1; data->offset = 0; + if ((ind->Addr() != nullptr) && ind->Addr()->OperIs(GT_LCL_VAR)) + { + data->lclNum = ind->Addr()->AsLclVar()->GetLclNum(); + } } else { @@ -10483,125 +10497,264 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); + data->storeFlags = ind->gtFlags; return true; } //------------------------------------------------------------------------ -// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar -// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// GetStoreCoalescingData: get the data needed to perform store coalescing for either +// STOREIND/STORE_BLK or STORE_LCL_FLD. // -// * STOREIND int -// +--* LCL_VAR byref V00 -// \--* CNS_INT int 0x1 +// Arguments: +// store - the store node +// data - [OUT] coalescing data // -// * STOREIND int -// +--* LEA(b+4) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT int 0x2 +// Return Value: +// true if the data was successfully retrieved, false otherwise. 
// -// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const +{ + if (store->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return GetLoadStoreCoalescingData(store->AsIndir(), data); + } + + if (!store->OperIs(GT_STORE_LCL_FLD)) + { + return false; + } + + auto isNodeInvariant = [](Compiler* compiler, GenTree* node) { + if (node->OperIsConst()) + { + return true; + } + + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); + }; + + auto* lclStore = store->AsLclFld(); + if (!isNodeInvariant(m_compiler, lclStore->Data())) + { + return false; + } + + bool isClosedRange = false; + LIR::ReadOnlyRange range = BlockRange().GetTreeRange(store, &isClosedRange); + if (!isClosedRange) + { + return false; + } + + LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); + data->targetType = store->TypeGet(); + data->accessSize = lclStore->GetSize(); + data->value = lclStore->Data(); + data->offset = lclStore->GetLclOffs(); + data->lclNum = lclStore->GetLclNum(); + data->isLocalStoreNode = true; + data->isAddressExposed = varDsc->IsAddressExposed(); + data->storeFlags = store->gtFlags; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); + + return true; +} + +//------------------------------------------------------------------------ +// LowerCheckCoalescedStoreAtomicity: decide whether it's safe to coalesce two adjacent +// stores into a single wider store without violating atomicity guarantees. 
// -// * STOREIND long -// +--* LEA(b+0) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT long 0x200000001 +// Arguments: +// currStore - the current store node +// prevStore - the previous store node +// currData - coalescing data for the current store +// prevData - coalescing data for the previous store // -// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: -// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// Return Value: +// true if it is safe to coalesce the two stores, false otherwise. +// +bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + GenTree* prevStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData) +{ + // Now the hardest part: decide whether it's safe to use an unaligned write. + // + // IND is always fine (and all IND created here from such) + // IND is not required to be atomic per our Memory Model + bool allowsNonAtomic = currData.AllowsNonAtomic() && prevData.AllowsNonAtomic(); + int minOffset = min(prevData.offset, currData.offset); + int maxEndOffset = max(prevData.offset + static_cast(prevData.accessSize), + currData.offset + static_cast(currData.accessSize)); + unsigned combinedSize = static_cast(maxEndOffset - minOffset); + + if (!allowsNonAtomic && (currData.lclNum == m_compiler->info.compRetBuffArg)) + { + // RetBuf is a private stack memory, so we don't need to worry about atomicity. + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore() && !currData.isAddressExposed && !prevData.isAddressExposed) + { + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore()) + { + // During lowering we cannot rely on the final stack offset yet. Instead, use the stack-home + // alignment guarantees implied by local allocation: locals are naturally aligned up to pointer + // size, and address-exposed locals must be conservative beyond that. 
+ unsigned homeAlignment = min(m_compiler->lvaLclStackHomeSize(currData.lclNum), (unsigned)TARGET_POINTER_SIZE); + bool isCombinedStoreAtomic = (combinedSize <= homeAlignment) && ((minOffset % combinedSize) == 0); + +#ifdef TARGET_ARM64 + if (currData.accessSize == TARGET_POINTER_SIZE) + { + // Per Arm ARM, a 128-bit SIMD write that is 64-bit aligned is treated as a pair of + // single-copy atomic 64-bit writes. + isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); + } +#endif + + return isCombinedStoreAtomic; + } + + if (!allowsNonAtomic && (genTypeSize(currStore) > 1) && !varTypeIsSIMD(currStore)) + { + // Ignore indices for now, they can invalidate our alignment assumptions. + // Although, we can take scale into account. + if (currData.index != nullptr) + { + return false; + } + + // Base address being TYP_REF gives us a hint that data is pointer-aligned. + if (!currData.baseAddr->TypeIs(TYP_REF)) + { + return false; + } + + // Check whether the combined indir is still aligned. + bool isCombinedIndirAtomic = + (combinedSize <= TARGET_POINTER_SIZE) && ((minOffset % static_cast(combinedSize)) == 0); + + if (combinedSize == 2 * TARGET_POINTER_SIZE) + { +#ifdef TARGET_ARM64 + // Per Arm Architecture Reference Manual for A-profile architecture: + // + // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory + // are treated as a pair of single - copy atomic 64 - bit writes. + // + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) + // + // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise + // we're already doing a load that has no atomicity guarantees. 
+ isCombinedIndirAtomic = true; +#endif + } + else if (combinedSize > TARGET_POINTER_SIZE) + { + return false; + } + + if (!isCombinedIndirAtomic) + { + return false; + } + } + + return true; +} + +//------------------------------------------------------------------------ +// LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD. // // Arguments: -// ind - the current STOREIND node +// node - current store node // -void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +void Lowering::LowerStoreCoalescing(GenTree* node) { -// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from -// unaligned accesses making this optimization questionable. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { return; } - if (!ind->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (!node->OperIs(GT_STOREIND, GT_STORE_BLK, GT_STORE_LCL_FLD)) { - // Load coalescing is not yet supported return; } - // We're going to do it in a loop while we see suitable STOREINDs to coalesce. - // E.g.: we have the following LIR sequence: - // - // ...addr nodes... - // STOREIND(int) - // ...addr nodes... - // STOREIND(short) - // ...addr nodes... - // STOREIND(short) <-- we're here - // - // First we merge two 'short' stores, then we merge the result with the 'int' store - // to get a single store of 8 bytes. 
do { LoadStoreCoalescingData currData; LoadStoreCoalescingData prevData; - // Get coalescing data for the current STOREIND - if (!GetLoadStoreCoalescingData(ind, &currData)) + if (!GetStoreCoalescingData(node, &currData)) { return; } GenTree* prevTree = currData.rangeStart->gtPrev; - // Now we need to find the previous STOREIND, - // we can ignore any NOPs or IL_OFFSETs in-between while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET)) { prevTree = prevTree->gtPrev; } - // It's not a store - bail out. - if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (prevTree == nullptr) { return; } - // Get coalescing data for the previous STOREIND - GenTreeIndir* prevInd = prevTree->AsIndir(); - if (!GetLoadStoreCoalescingData(prevInd, &prevData)) + if (node->OperIs(GT_STORE_LCL_FLD)) + { + if (!prevTree->OperIs(GT_STORE_LCL_FLD)) + { + return; + } + } + else if (!prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) { return; } - if (!currData.IsAddressEqual(prevData, /*ignoreOffset*/ true)) + if (!GetStoreCoalescingData(prevTree, &prevData)) { - // Non-offset part of the address is not the same - bail out. return; } - // The same offset means that we're storing to the same location of the same width. - // Just remove the previous store then. 
- if (prevData.offset == currData.offset) + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); + bool const sameLocation = currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization); + + if (!sameLocation) { + return; + } + + if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) + { + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG)) + { + return; + } + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } - // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial. - // The algorithm does support TYP_REF (with null value), but it seems to be not worth - // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two - // items at once. Although, it may be profitable to do "stp q0, q0, [addr]". - if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind)) + if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; } - assert(ind->OperIs(GT_STOREIND)); - assert(prevInd->OperIs(GT_STOREIND)); assert(prevData.IsStore()); assert(currData.IsStore()); - // For now, only non-relocatable constants are supported for data. if (!prevData.value->OperIsConst() || !currData.value->OperIsConst()) { return; @@ -10617,161 +10770,132 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) return; } - // Otherwise, the difference between two offsets has to match the size of the type. - // We don't support overlapping stores. - if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType)) + if (node->OperIs(GT_STOREIND, GT_STORE_BLK) && !node->AsIndir()->Addr()->OperIs(GT_LEA)) { return; } - // For now, we require the current STOREIND to have LEA (previous store may not have it) - // So we can easily adjust the offset, consider making it more flexible in future. 
- if (!ind->Addr()->OperIs(GT_LEA)) - { - return; - } - - // Now the hardest part: decide whether it's safe to use an unaligned write. - // - // IND is always fine (and all IND created here from such) - // IND is not required to be atomic per our Memory Model - bool allowsNonAtomic = - ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0); + var_types oldType = node->TypeGet(); + var_types newType = TYP_UNDEF; + bool tryReusingPrevValue = false; + int const minOffset = min(prevData.offset, currData.offset); + int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast(currData.accessSize); + int const maxEndOffset = max(prevEndOffset, currEndOffset); + int const combinedSize = maxEndOffset - minOffset; + bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); + bool const currContainsPrev = (currData.offset <= prevData.offset) && (currEndOffset >= prevEndOffset); + bool const storesAreAdjacent = (prevEndOffset == currData.offset) || (currEndOffset == prevData.offset); + bool const storesHaveSameSize = prevData.accessSize == currData.accessSize; - if (!allowsNonAtomic && currData.baseAddr->OperIs(GT_LCL_VAR) && - (currData.baseAddr->AsLclVar()->GetLclNum() == m_compiler->info.compRetBuffArg)) + if (allowOverlapOptimization && currContainsPrev) { - // RetBuf is a private stack memory, so we don't need to worry about atomicity. - allowsNonAtomic = true; + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + continue; } - if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind)) + if (allowOverlapOptimization && prevContainsCurr) { - // TODO-CQ: if we see that the target is a local memory (non address exposed) - // we can use any type (including SIMD) for a new load. - - // Ignore indices for now, they can invalidate our alignment assumptions. 
- // Although, we can take scale into account. - if (currData.index != nullptr) + if (!varTypeIsIntegral(node)) { return; } - // Base address being TYP_REF gives us a hint that data is pointer-aligned. - if (!currData.baseAddr->TypeIs(TYP_REF)) + switch (combinedSize) { - return; + case 1: + newType = TYP_UBYTE; + break; + case 2: + newType = TYP_USHORT; + break; + case 4: + newType = TYP_INT; + break; +#ifdef TARGET_64BIT + case 8: + newType = TYP_LONG; + break; +#endif + default: + return; } - - // Check whether the combined indir is still aligned. - bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) && - (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0; - - if (genTypeSize(ind) == TARGET_POINTER_SIZE) + } + else + { + if (!storesAreAdjacent || !storesHaveSameSize) { -#ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) - // - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. - isCombinedIndirAtomic = true; -#endif + return; } - if (!isCombinedIndirAtomic) + if (abs(prevData.offset - currData.offset) != static_cast(prevData.accessSize)) { return; } - } - // Since we're merging two stores of the same type, the new type is twice wider. 
- var_types oldType = ind->TypeGet(); - var_types newType = TYP_UNDEF; - bool tryReusingPrevValue = false; - switch (oldType) - { - case TYP_BYTE: - case TYP_UBYTE: - newType = TYP_USHORT; - break; + switch (oldType) + { + case TYP_BYTE: + case TYP_UBYTE: + newType = TYP_USHORT; + break; - case TYP_SHORT: - case TYP_USHORT: - newType = TYP_INT; - break; + case TYP_SHORT: + case TYP_USHORT: + newType = TYP_INT; + break; #ifdef TARGET_64BIT - case TYP_INT: - newType = TYP_LONG; - break; + case TYP_INT: + newType = TYP_LONG; + break; #if defined(FEATURE_HW_INTRINSICS) - case TYP_LONG: - case TYP_REF: - // TLDR: we should be here only if one of the conditions is true: - // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag - // 2) ARM64: Data is at least 8-byte aligned - // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+ - // - newType = TYP_SIMD16; - if ((oldType == TYP_REF) && - (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) - { - // For TYP_REF we only support null values. 
In theory, we can also support frozen handles, e.g.: - // - // arr[1] = "hello"; - // arr[0] = "world"; - // - // but we don't want to load managed references into SIMD registers (we can only do so - // when we can issue a nongc region for a block) - return; - } - break; + case TYP_LONG: + case TYP_REF: + newType = TYP_SIMD16; + if ((oldType == TYP_REF) && + (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) + { + return; + } + break; #if defined(TARGET_AMD64) - case TYP_SIMD16: - if (m_compiler->getPreferredVectorByteLength() >= 32) - { - newType = TYP_SIMD32; + case TYP_SIMD16: + if (m_compiler->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; - case TYP_SIMD32: - if (m_compiler->getPreferredVectorByteLength() >= 64) - { - newType = TYP_SIMD64; + case TYP_SIMD32: + if (m_compiler->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; -#elif defined(TARGET_ARM64) // TARGET_AMD64 - case TYP_SIMD16: - tryReusingPrevValue = true; - break; - -#endif // TARGET_ARM64 -#endif // FEATURE_HW_INTRINSICS -#endif // TARGET_64BIT +#elif defined(TARGET_ARM64) + case TYP_SIMD16: + tryReusingPrevValue = true; + break; +#endif +#endif +#endif + default: + return; + } + } - // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to - // be converted to TYP_INT/TYP_LONG for constant value. - // - // TYP_UINT and TYP_ULONG are not legal for GT_IND. - // - default: - return; + if (!LowerCheckCoalescedStoreAtomicity(node, prevTree, currData, prevData)) + { + return; } - // If we can't merge these two stores into a single store, we can at least - // cache prevData.value to a local and reuse it in currData. - // Normally, LSRA is expected to do this for us, but it's not always the case for SIMD. 
if (tryReusingPrevValue) { #if defined(FEATURE_HW_INTRINSICS) @@ -10783,43 +10907,56 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) m_compiler->gtNewLclvNode(use.ReplaceWithLclVar(m_compiler), prevData.value->TypeGet()); BlockRange().InsertBefore(currData.value, prevValueTmp); BlockRange().Remove(currData.value); - ind->Data() = prevValueTmp; + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = prevValueTmp; + } + else + { + node->AsIndir()->Data() = prevValueTmp; + } } -#endif // FEATURE_HW_INTRINSICS +#endif return; } assert(newType != TYP_UNDEF); - // We should not be here for stores requiring write barriers. - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind->AsStoreInd())); - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd->AsStoreInd())); + if (node->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + auto* ind = node->AsStoreInd(); + auto* prevInd = prevTree->AsStoreInd(); - // Delete previous STOREIND entirely - BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd)); - // It's not expected to be contained yet, but just in case... 
- ind->Data()->ClearContained(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // We know it's always LEA for now - GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + ind->Data()->ClearContained(); + GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + addr->SetOffset(min(prevData.offset, currData.offset)); - // Update offset to be the minimum of the two - addr->SetOffset(min(prevData.offset, currData.offset)); + ind->gtType = newType; + ind->Data()->gtType = newType; + } + else + { + auto* lclStore = node->AsLclFld(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // Update type for both STOREIND and val - ind->gtType = newType; - ind->Data()->gtType = newType; + lclStore->Data()->ClearContained(); + lclStore->gtType = newType; + lclStore->Data()->gtType = newType; + lclStore->SetLclOffs(min(prevData.offset, currData.offset)); + } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) - // Upgrading two SIMD stores to a wider SIMD store. - // Only on x64 since ARM64 has no options above SIMD16 if (varTypeIsSIMD(oldType)) { int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; - // if the previous store was at a higher address, swap the constants if (prevData.offset > currData.offset) { std::swap(lowerCns, upperCns); @@ -10830,56 +10967,94 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) memcpy(newCns.i8, lowerCns, oldWidth); memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); - ind->Data()->AsVecCon()->gtSimdVal = newCns; + currData.value->AsVecCon()->gtSimdVal = newCns; continue; } #endif - size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue(); - size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue(); - - // if the previous store was at a higher address, swap the constants - if (prevData.offset > currData.offset) - { - std::swap(lowerCns, upperCns); - } + size_t lowerCns = 
static_cast(prevData.value->AsIntCon()->IconValue()); + size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) - // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16 - // All legality checks were done above. if (varTypeIsSIMD(newType)) { - // Replace two 64bit constants with a single 128bit constant + if (prevData.offset > currData.offset) + { + std::swap(lowerCns, upperCns); + } + int8_t val[16]; memcpy(val, &lowerCns, 8); memcpy(val + 8, &upperCns, 8); GenTreeVecCon* vecCns = m_compiler->gtNewVconNode(newType, &val); - BlockRange().InsertAfter(ind->Data(), vecCns); - BlockRange().Remove(ind->Data()); - ind->gtOp2 = vecCns; + BlockRange().InsertAfter(currData.value, vecCns); + BlockRange().Remove(currData.value); + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = vecCns; + } + else + { + node->AsIndir()->Data() = vecCns; + } + continue; } -#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS +#endif - // Trim the constants to the size of the type, e.g. for TYP_SHORT and TYP_USHORT - // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF. 
- size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_PER_BYTE; - lowerCns &= mask; - upperCns &= mask; + size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; + size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; + size_t newMask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(newType)) * BITS_PER_BYTE; + lowerCns &= prevMask; + upperCns &= currMask; - size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_PER_BYTE))); + size_t val = 0; + val |= (lowerCns << ((prevData.offset - minOffset) * BITS_PER_BYTE)); + val |= (upperCns << ((currData.offset - minOffset) * BITS_PER_BYTE)); + val &= newMask; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); - ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val; - if (genTypeSize(oldType) == 1) + auto* intCon = currData.value->AsIntCon(); + intCon->gtIconVal = static_cast(val); + if (genTypeSize(oldType) == 1 && node->OperIsIndir()) { - // A mark for future foldings that this IND doesn't need to be atomic. - ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; + node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } - } while (true); -#endif // TARGET_XARCH || TARGET_ARM64 +#endif +} + +//------------------------------------------------------------------------ +// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar +// IND/STOREIND node, try to merge them into a single store of a twice wider type. 
Example: +// +// * STOREIND int +// +--* LCL_VAR byref V00 +// \--* CNS_INT int 0x1 +// +// * STOREIND int +// +--* LEA(b+4) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT int 0x2 +// +// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +// +// * STOREIND long +// +--* LEA(b+0) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT long 0x200000001 +// +// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: +// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// +// Arguments: +// ind - the current STOREIND node +// +void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +{ + LowerStoreCoalescing(ind); } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 763f68ff607771..47f2c09f730fdd 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -350,32 +350,62 @@ class Lowering final : public Phase struct LoadStoreCoalescingData { - var_types targetType; - GenTree* baseAddr; - GenTree* index; - GenTree* value; - uint32_t scale; - int offset; - GenTree* rangeStart; - GenTree* rangeEnd; + var_types targetType = TYP_UNDEF; + GenTree* baseAddr = nullptr; + GenTree* index = nullptr; + GenTree* value = nullptr; + uint32_t scale = 1; + int offset = 0; + unsigned accessSize = 0; + unsigned lclNum = BAD_VAR_NUM; + GenTreeFlags storeFlags = GTF_EMPTY; + bool isLocalStoreNode = false; + bool isAddressExposed = false; + GenTree* rangeStart = nullptr; + GenTree* rangeEnd = nullptr; bool IsStore() const { return value != nullptr; } - bool IsAddressEqual(const LoadStoreCoalescingData& other, bool ignoreOffset) const + bool IsLocalStore() const { - if ((scale != other.scale) || (targetType != other.targetType) || - !GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + return isLocalStoreNode; + } + + bool AllowsNonAtomic() 
const + { + return (storeFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0; + } + + bool IsAddressEqual(const LoadStoreCoalescingData& other, + bool ignoreOffset, + bool allowDifferentTypes = false) const + { + if ((scale != other.scale) || (!allowDifferentTypes && (targetType != other.targetType))) { return false; } + + if (IsLocalStore() || other.IsLocalStore()) + { + if (!IsLocalStore() || !other.IsLocalStore() || (lclNum != other.lclNum)) + { + return false; + } + } + else if (!GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + { + return false; + } + return ignoreOffset || (offset == other.offset); } }; bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const; + bool GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const; // Per tree node member functions GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind); @@ -388,7 +418,12 @@ class Lowering final : public Phase void MarkTree(GenTree* root); void UnmarkTree(GenTree* root); GenTree* LowerStoreIndir(GenTreeStoreInd* node); + void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); + bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + GenTree* prevStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index b9fa68710da733..9ae7a9c83543ff 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -2,20 +2,24 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// -using System.Runtime.CompilerServices; using System; +using System.Runtime.CompilerServices; using Xunit; public class SpAddr { - - // Struct in reg (2 ints) struct S { public int i0; public int i1; } + struct Pair + { + public int A; + public int B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -35,12 +39,55 @@ static int M(int i0, int i1, int i2, int i3) return Foo(s0, s1); // r0 <= r1; r1 <= r0; r2 <= r3; r3 <= r2 } + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Expose(ref Pair p) + { + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long NonAddressExposed(int a, int b) + { + Pair p; + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AddressExposed(int a, int b) + { + Pair p = default; + Expose(ref p); + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static int OverlappingZeroThenShort() + { + Pair p = default; + p.A = 0; + Unsafe.As(ref p) = 256; + return p.A; + } + [Fact] public static int TestEntryPoint() { int res = M(1, 2, 3, 4); Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); - if (res == 10) + long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); + long addressExposed = AddressExposed(0x10203040, 0x50607080); + int overlapping = OverlappingZeroThenShort(); + + if ((res == 10) && + (nonAddressExposed == 0x5566778811223344L) && + (addressExposed == 0x5060708010203040L) && + (overlapping == 256)) return 100; else return 99; From 8d03f6e7395093a93ec38e941f5aa623fa57adae Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Tue, 7 Apr 2026 13:39:39 -0700 Subject: [PATCH 2/9] review feedback --- src/coreclr/jit/lower.cpp | 131 +++++++++++++++++++++++--------------- src/coreclr/jit/lower.h | 1 - 2 files changed, 81 insertions(+), 51 deletions(-) diff --git 
a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 06e5b3fe8b4689..f71394ac4d63c9 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10391,6 +10391,34 @@ void Lowering::LowerBlockStoreAsHelperCall(GenTreeBlk* blkNode) } } +//------------------------------------------------------------------------ +// IsStoreCoalescingInvariantNode: check whether a node is simple enough to participate in +// lowering-time store coalescing. +// +// Arguments: +// compiler - the compiler instance +// node - the node to examine +// allowNull - true if a null node should be considered valid +// +// Return Value: +// true if the node is an allowed invariant input for store coalescing; otherwise false. +// +static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bool allowNull = false) +{ + if (node == nullptr) + { + return allowNull; + } + + if (node->OperIsConst()) + { + return true; + } + + // We can allow bigger trees here, but it's not clear if it's worth it. + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node. @@ -10414,23 +10442,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing const bool isStore = ind->OperIs(GT_STOREIND, GT_STORE_BLK); const bool isLoad = ind->OperIs(GT_IND); - auto isNodeInvariant = [](Compiler* m_compiler, GenTree* node, bool allowNull) { - if (node == nullptr) - { - return allowNull; - } - if (node->OperIsConst()) - { - return true; - } - // We can allow bigger trees here, but it's not clear if it's worth it. 
- return node->OperIs(GT_LCL_VAR) && !m_compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - if (isStore) { // For stores, Data() is expected to be an invariant node - if (!isNodeInvariant(m_compiler, ind->Data(), false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, ind->Data())) { return false; } @@ -10447,14 +10462,14 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing { GenTree* base = ind->Addr()->AsAddrMode()->Base(); GenTree* index = ind->Addr()->AsAddrMode()->Index(); - if (!isNodeInvariant(m_compiler, base, false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, base)) { // Base must be a local. It's possible for it to be nullptr when index is not null, // but let's ignore such cases. return false; } - if (!isNodeInvariant(m_compiler, index, true)) + if (!IsStoreCoalescingInvariantNode(m_compiler, index, true)) { // Index should be either nullptr or a local. return false; @@ -10469,7 +10484,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->lclNum = base->AsLclVar()->GetLclNum(); } } - else if (isNodeInvariant(m_compiler, ind->Addr(), true)) + else if (IsStoreCoalescingInvariantNode(m_compiler, ind->Addr(), true)) { // Address is just a local, no offset, scale is 1 data->baseAddr = ind->Addr(); @@ -10525,17 +10540,8 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d return false; } - auto isNodeInvariant = [](Compiler* compiler, GenTree* node) { - if (node->OperIsConst()) - { - return true; - } - - return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - auto* lclStore = store->AsLclFld(); - if (!isNodeInvariant(m_compiler, lclStore->Data())) + if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) { return false; } @@ -10568,7 +10574,6 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d // // Arguments: // currStore - the current store node 
-// prevStore - the previous store node // currData - coalescing data for the current store // prevData - coalescing data for the previous store // @@ -10576,7 +10581,6 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d // true if it is safe to coalesce the two stores, false otherwise. // bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, - GenTree* prevStore, const LoadStoreCoalescingData& currData, const LoadStoreCoalescingData& prevData) { @@ -10612,8 +10616,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* #ifdef TARGET_ARM64 if (currData.accessSize == TARGET_POINTER_SIZE) { - // Per Arm ARM, a 128-bit SIMD write that is 64-bit aligned is treated as a pair of - // single-copy atomic 64-bit writes. + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // a 128-bit SIMD write that is 64-bit aligned is treated as a pair of single-copy atomic 64-bit writes. isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); } #endif @@ -10643,15 +10647,13 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* if (combinedSize == 2 * TARGET_POINTER_SIZE) { #ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory are + // treated as a pair of single-copy atomic 64-bit writes. 
// - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null), and assume TYP_LONG/TYP_REF + // are already 64-bit aligned. Otherwise the existing load/store sequence would not have atomicity + // guarantees either. isCombinedIndirAtomic = true; #endif } @@ -10678,6 +10680,9 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* // void Lowering::LowerStoreCoalescing(GenTree* node) { + // LA, RISC-V, ARM32, and WASM are more likely to receive a large performance hit from unaligned accesses, + // making this optimization questionable there. + // TODO-WASM-CQ: re-evaluate enabling this once we have better data for Wasm unaligned accesses. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { @@ -10737,16 +10742,23 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) { - if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG) || - m_compiler->gtTreeHasSideEffects(currData.value, GTF_SIDE_EFFECT | GTF_GLOB_EFFECT | GTF_ASG)) + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_GLOB_EFFECT) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_GLOB_EFFECT)) { return; } + JITDUMP( + "Store coalescing: removing previous store [%06u] because store [%06u] rewrites the same location\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } + // TODO-ARM64-CQ: enable TYP_REF if we find a case where it's beneficial. + // The algorithm does support TYP_REF (with null value), but it seems to be not worth + // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two + // items at once. 
Although, it may be profitable to do "stp q0, q0, [addr]". if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; @@ -10790,6 +10802,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if (allowOverlapOptimization && currContainsPrev) { + JITDUMP("Store coalescing: removing previous store [%06u] because store [%06u] fully overwrites it\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } @@ -10853,16 +10867,29 @@ void Lowering::LowerStoreCoalescing(GenTree* node) #if defined(FEATURE_HW_INTRINSICS) case TYP_LONG: case TYP_REF: + // TLDR: we should be here only if one of the conditions is true: + // 1) Both stores have the GTF_IND_ALLOW_NON_ATOMIC flag + // 2) ARM64: data is at least 8-byte aligned + // 3) AMD64: data is at least 16-byte aligned on AMD/Intel with AVX+ newType = TYP_SIMD16; if ((oldType == TYP_REF) && (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) { + // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: + // + // arr[1] = "hello"; + // arr[0] = "world"; + // + // but we don't want to load managed references into SIMD registers (we can only do so + // when we can issue a nongc region for a block). return; } break; #if defined(TARGET_AMD64) case TYP_SIMD16: + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16. 
if (m_compiler->getPreferredVectorByteLength() >= 32) { newType = TYP_SIMD32; @@ -10879,19 +10906,19 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } tryReusingPrevValue = true; break; -#elif defined(TARGET_ARM64) +#elif defined(TARGET_ARM64) // TARGET_AMD64 case TYP_SIMD16: tryReusingPrevValue = true; break; -#endif -#endif -#endif +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT default: return; } } - if (!LowerCheckCoalescedStoreAtomicity(node, prevTree, currData, prevData)) + if (!LowerCheckCoalescedStoreAtomicity(node, currData, prevData)) { return; } @@ -10917,7 +10944,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) node->AsIndir()->Data() = prevValueTmp; } } -#endif +#endif // FEATURE_HW_INTRINSICS return; } @@ -10952,6 +10979,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16. if (varTypeIsSIMD(oldType)) { int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; @@ -10970,8 +10999,10 @@ void Lowering::LowerStoreCoalescing(GenTree* node) currData.value->AsVecCon()->gtSimdVal = newCns; continue; } -#endif +#endif // TARGET_AMD64 && FEATURE_HW_INTRINSICS + // The integer path below places each constant according to its byte offset, so it doesn't need to swap the + // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. 
size_t lowerCns = static_cast(prevData.value->AsIntCon()->IconValue()); size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); @@ -11002,7 +11033,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) continue; } -#endif +#endif // TARGET_64BIT && FEATURE_HW_INTRINSICS size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; @@ -11023,7 +11054,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } } while (true); -#endif +#endif // TARGET_XARCH || TARGET_ARM64 } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 47f2c09f730fdd..698396ecbc406b 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -421,7 +421,6 @@ class Lowering final : public Phase void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, - GenTree* prevStore, const LoadStoreCoalescingData& currData, const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); From 578f52ba2407b0bdc9ec26d8b7e2cbf802262d8b Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Thu, 9 Apr 2026 14:50:32 -0700 Subject: [PATCH 3/9] fix issue with overlapping stores --- src/coreclr/jit/lower.cpp | 13 +++++++++---- src/tests/JIT/Directed/StructPromote/SpAddr.cs | 10 +++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index f71394ac4d63c9..701c376be4d63f 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -11041,10 +11041,15 @@ void Lowering::LowerStoreCoalescing(GenTree* node) lowerCns &= prevMask; upperCns &= currMask; - size_t val = 0; - val |= (lowerCns << ((prevData.offset - minOffset) * BITS_PER_BYTE)); - val |= (upperCns << 
((currData.offset - minOffset) * BITS_PER_BYTE)); - val &= newMask; + unsigned const prevShift = static_cast((prevData.offset - minOffset) * BITS_PER_BYTE); + unsigned const currShift = static_cast((currData.offset - minOffset) * BITS_PER_BYTE); + + size_t prevBits = (lowerCns << prevShift) & newMask; + size_t currBits = (upperCns << currShift) & newMask; + + // Later stores must overwrite any overlapping bytes from earlier stores. + size_t currBitsMask = (currMask << currShift) & newMask; + size_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); auto* intCon = currData.value->AsIntCon(); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index 9ae7a9c83543ff..e0ec39b84927ac 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -67,11 +67,11 @@ static long AddressExposed(int a, int b) } [MethodImpl(MethodImplOptions.NoInlining)] - static int OverlappingZeroThenShort() + static int OverlappingNonZeroThenShort() { Pair p = default; - p.A = 0; - Unsafe.As(ref p) = 256; + p.A = unchecked((int)0xDEADBEEF); + Unsafe.As(ref p) = 0; return p.A; } @@ -82,12 +82,12 @@ public static int TestEntryPoint() Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); long addressExposed = AddressExposed(0x10203040, 0x50607080); - int overlapping = OverlappingZeroThenShort(); + int overlapping = OverlappingNonZeroThenShort(); if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == 256)) + (overlapping == unchecked((int)0xDEAD0000))) return 100; else return 99; From d9a6a7d773eb0101d6d3f2f8cb505f09fab702af Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Fri, 10 Apr 2026 07:24:29 -0700 Subject: [PATCH 4/9] fix mixed int/float cases --- src/coreclr/jit/lower.cpp 
| 67 +++++++++++++++++-- .../JIT/Directed/StructPromote/SpAddr.cs | 23 ++++++- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 701c376be4d63f..9b62b2b18a1d96 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10419,6 +10419,51 @@ static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bo return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); } +//------------------------------------------------------------------------ +// TryGetStoreCoalescingConstantBits: get the raw bits for a constant used by store +// coalescing. +// +// Arguments: +// value - the constant node +// bits - [out] the raw constant bits +// +// Return Value: +// true if the constant can participate in store coalescing; otherwise false. +// +static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) +{ + assert(bits != nullptr); + *bits = 0; + + if (value->IsCnsIntOrI()) + { + *bits = static_cast(value->AsIntCon()->IconValue()); + return true; + } + + if (value->IsCnsFltOrDbl()) + { + if (value->TypeIs(TYP_FLOAT)) + { + float floatCns = static_cast(value->AsDblCon()->DconValue()); + memcpy(bits, &floatCns, sizeof(floatCns)); + return true; + } + +#ifdef TARGET_64BIT + // This helper returns bits through size_t, so only 64-bit targets can carry the full payload of a double. + // That also matches the current coalescing surface: double-to-integer store retyping only matters when the + // resulting 8-byte integer store is supported. 
+ assert(value->TypeIs(TYP_DOUBLE)); + double doubleCns = value->AsDblCon()->DconValue(); + memcpy(bits, &doubleCns, sizeof(doubleCns)); + return true; +#endif + } + + return false; +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node. @@ -10733,7 +10778,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); - bool const sameLocation = currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization); + int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast(currData.accessSize); + bool const storesOverlap = (currData.offset < prevEndOffset) && (prevData.offset < currEndOffset); + bool const sameLocation = + currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization && storesOverlap); if (!sameLocation) { @@ -10791,8 +10840,6 @@ void Lowering::LowerStoreCoalescing(GenTree* node) var_types newType = TYP_UNDEF; bool tryReusingPrevValue = false; int const minOffset = min(prevData.offset, currData.offset); - int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); - int const currEndOffset = currData.offset + static_cast(currData.accessSize); int const maxEndOffset = max(prevEndOffset, currEndOffset); int const combinedSize = maxEndOffset - minOffset; bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); @@ -10983,6 +11030,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // Only on x64 since ARM64 has no options above SIMD16. 
if (varTypeIsSIMD(oldType)) { + if (!prevData.value->OperIs(GT_CNS_VEC) || !currData.value->OperIs(GT_CNS_VEC)) + { + return; + } + int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; @@ -11003,8 +11055,13 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // The integer path below places each constant according to its byte offset, so it doesn't need to swap the // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. - size_t lowerCns = static_cast(prevData.value->AsIntCon()->IconValue()); - size_t upperCns = static_cast(currData.value->AsIntCon()->IconValue()); + size_t lowerCns = 0; + size_t upperCns = 0; + if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || + !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) + { + return; + } #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) if (varTypeIsSIMD(newType)) diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index e0ec39b84927ac..ab5f694192e12e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -20,6 +20,12 @@ struct Pair public int B; } + struct FloatPair + { + public float A; + public float B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -42,6 +48,10 @@ static int M(int i0, int i1, int i2, int i3) [MethodImpl(MethodImplOptions.NoInlining)] static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(FloatPair p) => + ((long)(uint)BitConverter.SingleToInt32Bits(p.B) << 32) | (uint)BitConverter.SingleToInt32Bits(p.A); + [MethodImpl(MethodImplOptions.NoInlining)] static void Expose(ref Pair p) { @@ -75,6 +85,15 @@ static int OverlappingNonZeroThenShort() return p.A; } + [MethodImpl(MethodImplOptions.NoInlining)] + static long 
AdjacentFloatNonZeroThenZero() + { + FloatPair p = default; + p.A = 1.0f; + p.B = 0.0f; + return Consume(p); + } + [Fact] public static int TestEntryPoint() { @@ -83,12 +102,13 @@ public static int TestEntryPoint() Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); long addressExposed = AddressExposed(0x10203040, 0x50607080); int overlapping = OverlappingNonZeroThenShort(); + long floatPair = AdjacentFloatNonZeroThenZero(); if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == unchecked((int)0xDEAD0000))) + (overlapping == unchecked((int)0xDEAD0000)) && + (floatPair == 0x000000003F800000L)) return 100; else return 99; From c61e8da06d624753e76d6bf193bd81dd04aa2b96 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 13 Apr 2026 09:49:37 -0700 Subject: [PATCH 5/9] review feedback: use BitOperations and uint64_t to build coalesced constant --- src/coreclr/jit/lower.cpp | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 9b62b2b18a1d96..dc299d24a78b7f 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10430,14 +10430,14 @@ static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bo // Return Value: // true if the constant can participate in store coalescing; otherwise false.
// -static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) +static bool TryGetStoreCoalescingConstantBits(GenTree* value, uint64_t* bits) { assert(bits != nullptr); *bits = 0; if (value->IsCnsIntOrI()) { - *bits = static_cast(value->AsIntCon()->IconValue()); + *bits = static_cast(value->AsIntCon()->IconValue()); return true; } @@ -10446,17 +10446,15 @@ static bool TryGetStoreCoalescingConstantBits(GenTree* value, size_t* bits) if (value->TypeIs(TYP_FLOAT)) { float floatCns = static_cast(value->AsDblCon()->DconValue()); - memcpy(bits, &floatCns, sizeof(floatCns)); + *bits = BitOperations::SingleToUInt32Bits(floatCns); return true; } #ifdef TARGET_64BIT - // This helper returns bits through size_t, so only 64-bit targets can carry the full payload of a double. - // That also matches the current coalescing surface: double-to-integer store retyping only matters when the - // resulting 8-byte integer store is supported. + // We only need the raw 64-bit payload for targets where the resulting 8-byte coalesced store is supported. assert(value->TypeIs(TYP_DOUBLE)); double doubleCns = value->AsDblCon()->DconValue(); - memcpy(bits, &doubleCns, sizeof(doubleCns)); + *bits = BitOperations::DoubleToUInt64Bits(doubleCns); return true; #endif } @@ -10777,6 +10775,12 @@ void Lowering::LowerStoreCoalescing(GenTree* node) return; } + if (((currData.storeFlags | prevData.storeFlags) & (GTF_IND_VOLATILE | GTF_ORDER_SIDEEFF)) != 0) + { + // Keep volatile and other ordering-sensitive stores in their original form. + return; + } + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); int const prevEndOffset = prevData.offset + static_cast(prevData.accessSize); int const currEndOffset = currData.offset + static_cast(currData.accessSize); @@ -11055,8 +11059,8 @@ void Lowering::LowerStoreCoalescing(GenTree* node) // The integer path below places each constant according to its byte offset, so it doesn't need to swap the // values first. 
Only the SIMD packing paths above need to normalize lower/upper order explicitly. - size_t lowerCns = 0; - size_t upperCns = 0; + uint64_t lowerCns = 0; + uint64_t upperCns = 0; if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) { @@ -11092,23 +11096,24 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } #endif // TARGET_64BIT && FEATURE_HW_INTRINSICS - size_t prevMask = ~(size_t(0)) >> (sizeof(size_t) - prevData.accessSize) * BITS_PER_BYTE; - size_t currMask = ~(size_t(0)) >> (sizeof(size_t) - currData.accessSize) * BITS_PER_BYTE; - size_t newMask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(newType)) * BITS_PER_BYTE; + uint64_t prevMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - prevData.accessSize) * BITS_PER_BYTE; + uint64_t currMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - currData.accessSize) * BITS_PER_BYTE; + uint64_t newMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - genTypeSize(newType)) * BITS_PER_BYTE; lowerCns &= prevMask; upperCns &= currMask; unsigned const prevShift = static_cast((prevData.offset - minOffset) * BITS_PER_BYTE); unsigned const currShift = static_cast((currData.offset - minOffset) * BITS_PER_BYTE); - size_t prevBits = (lowerCns << prevShift) & newMask; - size_t currBits = (upperCns << currShift) & newMask; + uint64_t prevBits = (lowerCns << prevShift) & newMask; + uint64_t currBits = (upperCns << currShift) & newMask; // Later stores must overwrite any overlapping bytes from earlier stores. 
- size_t currBitsMask = (currMask << currShift) & newMask; - size_t val = (prevBits & ~currBitsMask) | currBits; + uint64_t currBitsMask = (currMask << currShift) & newMask; + uint64_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); + assert(currData.value->OperIs(GT_CNS_INT)); auto* intCon = currData.value->AsIntCon(); intCon->gtIconVal = static_cast(val); if (genTypeSize(oldType) == 1 && node->OperIsIndir()) From a0cc8d7a9755d2d75a390868038e3d6a190cb4b8 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 08:45:34 -0700 Subject: [PATCH 6/9] fixes --- src/coreclr/jit/lower.cpp | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 2ac9d6d3ce1a3f..32a9305a84b7bd 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10574,7 +10574,7 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing //------------------------------------------------------------------------ // GetStoreCoalescingData: get the data needed to perform store coalescing for either -// STOREIND/STORE_BLK or STORE_LCL_FLD. +// STOREIND/STORE_BLK or local stores. 
// // Arguments: // store - the store node @@ -10590,12 +10590,12 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d return GetLoadStoreCoalescingData(store->AsIndir(), data); } - if (!store->OperIs(GT_STORE_LCL_FLD)) + if (!store->OperIsLocalStore()) { return false; } - auto* lclStore = store->AsLclFld(); + auto* lclStore = store->AsLclVarCommon(); if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) { return false; @@ -10610,13 +10610,16 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); data->targetType = store->TypeGet(); - data->accessSize = lclStore->GetSize(); + data->accessSize = store->OperIs(GT_STORE_LCL_FLD) ? lclStore->AsLclFld()->GetSize() : varDsc->lvExactSize(); data->value = lclStore->Data(); data->offset = lclStore->GetLclOffs(); data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; data->isAddressExposed = varDsc->IsAddressExposed(); - data->storeFlags = store->gtFlags; + // Local stores don't use the GTF_IND_* flag space semantically. In particular, GT_STORE_LCL_FLD + // may carry GTF_VAR_USEASG, which shares the same bit as GTF_IND_VOLATILE. Keep only the flags + // that are actually meaningful for indir coalescing decisions. + data->storeFlags = GTF_EMPTY; data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); @@ -10728,7 +10731,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* //------------------------------------------------------------------------ // LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, -// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD. +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD, +// where the previous local store may be either STORE_LCL_VAR or STORE_LCL_FLD. 
// // Arguments: // node - current store node @@ -10772,7 +10776,7 @@ void Lowering::LowerStoreCoalescing(GenTree* node) if (node->OperIs(GT_STORE_LCL_FLD)) { - if (!prevTree->OperIs(GT_STORE_LCL_FLD)) + if (!prevTree->OperIsLocalStore()) { return; } @@ -10787,9 +10791,11 @@ void Lowering::LowerStoreCoalescing(GenTree* node) return; } - if (((currData.storeFlags | prevData.storeFlags) & (GTF_IND_VOLATILE | GTF_ORDER_SIDEEFF)) != 0) + if (((currData.storeFlags | prevData.storeFlags) & GTF_IND_VOLATILE) != 0) { - // Keep volatile and other ordering-sensitive stores in their original form. + // Keep volatile stores in their original form. + // Regular STOREIND nodes may carry GTF_ORDER_SIDEEFF, and the pre-existing + // STOREIND coalescing logic intentionally allowed those cases. return; } @@ -11032,13 +11038,16 @@ void Lowering::LowerStoreCoalescing(GenTree* node) } else { - auto* lclStore = node->AsLclFld(); + auto* lclStore = node->AsLclVarCommon(); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); lclStore->Data()->ClearContained(); lclStore->gtType = newType; lclStore->Data()->gtType = newType; - lclStore->SetLclOffs(min(prevData.offset, currData.offset)); + if (lclStore->OperIs(GT_STORE_LCL_FLD)) + { + lclStore->AsLclFld()->SetLclOffs(min(prevData.offset, currData.offset)); + } } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) From 33c46127146466ea5a5f1c2606722d862397204d Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 09:35:48 -0700 Subject: [PATCH 7/9] remove comment --- src/coreclr/jit/lower.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 32a9305a84b7bd..b173d8c6b9d5e5 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10616,12 +10616,9 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; 
data->isAddressExposed = varDsc->IsAddressExposed(); - // Local stores don't use the GTF_IND_* flag space semantically. In particular, GT_STORE_LCL_FLD - // may carry GTF_VAR_USEASG, which shares the same bit as GTF_IND_VOLATILE. Keep only the flags - // that are actually meaningful for indir coalescing decisions. - data->storeFlags = GTF_EMPTY; - data->rangeStart = range.FirstNode(); - data->rangeEnd = range.LastNode(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); return true; } From 3c96099310c0ba902761f73991ad94201b39c778 Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Mon, 20 Apr 2026 12:22:44 -0700 Subject: [PATCH 8/9] format --- src/coreclr/jit/lower.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index b173d8c6b9d5e5..62124f149d322c 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10616,9 +10616,9 @@ bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* d data->lclNum = lclStore->GetLclNum(); data->isLocalStoreNode = true; data->isAddressExposed = varDsc->IsAddressExposed(); - data->storeFlags = GTF_EMPTY; - data->rangeStart = range.FirstNode(); - data->rangeEnd = range.LastNode(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); return true; } From b276d3375f5cbc6ec7228b542969443c7c7807cc Mon Sep 17 00:00:00 2001 From: Andy Ayers Date: Sat, 25 Apr 2026 08:38:39 -0700 Subject: [PATCH 9/9] Address PR feedback: guard RetBuf check and make test endian-agnostic - LowerCheckCoalescedStoreAtomicity: also require compRetBuffArg != BAD_VAR_NUM before relaxing atomicity for retbuf stores. Previously, when no retbuf was present, both lclNum and compRetBuffArg defaulted to BAD_VAR_NUM and the comparison trivially held, allowing non-atomic coalescing for unrelated indirect stores. 
- SpAddr regression test: compute the expected `overlapping` value based on BitConverter.IsLittleEndian so the test does not rely on little-endian byte layout. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/coreclr/jit/lower.cpp | 3 ++- src/tests/JIT/Directed/StructPromote/SpAddr.cs | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index e26dc6a6edaec4..5d783052b2fe8b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -10656,7 +10656,8 @@ bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currData.offset + static_cast(currData.accessSize)); unsigned combinedSize = static_cast(maxEndOffset - minOffset); - if (!allowsNonAtomic && (currData.lclNum == m_compiler->info.compRetBuffArg)) + if (!allowsNonAtomic && (m_compiler->info.compRetBuffArg != BAD_VAR_NUM) && + (currData.lclNum == m_compiler->info.compRetBuffArg)) { // RetBuf is a private stack memory, so we don't need to worry about atomicity. allowsNonAtomic = true; diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index ab5f694192e12e..e44c3176c89c0e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -104,10 +104,14 @@ public static int TestEntryPoint() int overlapping = OverlappingNonZeroThenShort(); long floatPair = AdjacentFloatNonZeroThenZero(); + int expectedOverlapping = BitConverter.IsLittleEndian + ? unchecked((int)0xDEAD0000) + : unchecked((int)0x0000BEEF); + if ((res == 10) && (nonAddressExposed == 0x5566778811223344L) && (addressExposed == 0x5060708010203040L) && - (overlapping == unchecked((int)0xDEAD0000)) && + (overlapping == expectedOverlapping) && (floatPair == 0x000000003F800000L)) return 100; else