diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 44cbd6d0e5eb71..e747bb8223c03d 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -91,6 +91,7 @@ CONFIG_INTEGER(JitHideAlignBehindJmp, "JitHideAlignBehindJmp", 1) // Track stores to locals done through return buffers. CONFIG_INTEGER(JitOptimizeStructHiddenBuffer, "JitOptimizeStructHiddenBuffer", 1) +RELEASE_CONFIG_INTEGER(JitEnableStoreLclFldCoalescing, "JitEnableStoreLclFldCoalescing", 1) CONFIG_INTEGER(JitUnrollLoopMaxIterationCount, "JitUnrollLoopMaxIterationCount", diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 4d30b7d109967a..5d783052b2fe8b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5668,6 +5668,11 @@ GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore) TryRetypingFloatingPointStoreToIntegerStore(lclStore); + if (lclStore->OperIs(GT_STORE_LCL_FLD) && (JitConfig.JitEnableStoreLclFldCoalescing() != 0)) + { + LowerStoreCoalescing(lclStore); + } + GenTree* src = lclStore->gtGetOp1(); LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); const bool srcIsMultiReg = src->IsMultiRegNode(); @@ -10405,6 +10410,77 @@ void Lowering::LowerBlockStoreAsHelperCall(GenTreeBlk* blkNode) } } +//------------------------------------------------------------------------ +// IsStoreCoalescingInvariantNode: check whether a node is simple enough to participate in +// lowering-time store coalescing. +// +// Arguments: +// compiler - the compiler instance +// node - the node to examine +// allowNull - true if a null node should be considered valid +// +// Return Value: +// true if the node is an allowed invariant input for store coalescing; otherwise false. +// +static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bool allowNull = false) +{ + if (node == nullptr) + { + return allowNull; + } + + if (node->OperIsConst()) + { + return true; + } + + // We can allow bigger trees here, but it's not clear if it's worth it. + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); +} + +//------------------------------------------------------------------------ +// TryGetStoreCoalescingConstantBits: get the raw bits for a constant used by store +// coalescing. +// +// Arguments: +// value - the constant node +// bits - [out] the raw constant bits +// +// Return Value: +// true if the constant can participate in store coalescing; otherwise false. +// +static bool TryGetStoreCoalescingConstantBits(GenTree* value, uint64_t* bits) +{ + assert(bits != nullptr); + *bits = 0; + + if (value->IsCnsIntOrI()) + { + *bits = static_cast<uint64_t>(value->AsIntCon()->IconValue()); + return true; + } + + if (value->IsCnsFltOrDbl()) + { + if (value->TypeIs(TYP_FLOAT)) + { + float floatCns = static_cast<float>(value->AsDblCon()->DconValue()); + *bits = BitOperations::SingleToUInt32Bits(floatCns); + return true; + } + +#ifdef TARGET_64BIT + // We only need the raw 64-bit payload for targets where the resulting 8-byte coalesced store is supported. + assert(value->TypeIs(TYP_DOUBLE)); + double doubleCns = value->AsDblCon()->DconValue(); + *bits = BitOperations::DoubleToUInt64Bits(doubleCns); + return true; +#endif + } + + return false; +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node.
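Aside (reviewer illustration, not part of the patch): TryGetStoreCoalescingConstantBits reduces every supported constant to a raw integer payload so that two payloads can later be packed into one wider immediate, e.g. 1.0f becomes 0x3F800000. A minimal standalone sketch of the same float-path reinterpretation, assuming only the C++ standard library:

    #include <cstdint>
    #include <cstring>

    static uint32_t SingleToUInt32Bits(float value)
    {
        uint32_t bits;
        std::memcpy(&bits, &value, sizeof(bits)); // bit-for-bit copy, no numeric conversion
        return bits;                              // e.g. 1.0f -> 0x3F800000
    }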
@@ -10428,23 +10504,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing const bool isStore = ind->OperIs(GT_STOREIND, GT_STORE_BLK); const bool isLoad = ind->OperIs(GT_IND); - auto isNodeInvariant = [](Compiler* m_compiler, GenTree* node, bool allowNull) { - if (node == nullptr) - { - return allowNull; - } - if (node->OperIsConst()) - { - return true; - } - // We can allow bigger trees here, but it's not clear if it's worth it. - return node->OperIs(GT_LCL_VAR) && !m_compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - if (isStore) { // For stores, Data() is expected to be an invariant node - if (!isNodeInvariant(m_compiler, ind->Data(), false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, ind->Data())) { return false; } @@ -10455,19 +10518,20 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing } data->targetType = ind->TypeGet(); + data->accessSize = ind->Size(); data->value = isStore ? ind->Data() : nullptr; if (ind->Addr()->OperIs(GT_LEA)) { GenTree* base = ind->Addr()->AsAddrMode()->Base(); GenTree* index = ind->Addr()->AsAddrMode()->Index(); - if (!isNodeInvariant(m_compiler, base, false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, base)) { // Base must be a local. It's possible for it to be nullptr when index is not null, // but let's ignore such cases. return false; } - if (!isNodeInvariant(m_compiler, index, true)) + if (!IsStoreCoalescingInvariantNode(m_compiler, index, true)) { // Index should be either nullptr or a local. return false; @@ -10477,14 +10541,22 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = index == nullptr ? nullptr : index; data->scale = ind->Addr()->AsAddrMode()->GetScale(); data->offset = ind->Addr()->AsAddrMode()->Offset(); + if ((base != nullptr) && base->OperIs(GT_LCL_VAR)) + { + data->lclNum = base->AsLclVar()->GetLclNum(); + } } - else if (isNodeInvariant(m_compiler, ind->Addr(), true)) + else if (IsStoreCoalescingInvariantNode(m_compiler, ind->Addr(), true)) { // Address is just a local, no offset, scale is 1 data->baseAddr = ind->Addr(); data->index = nullptr; data->scale = 1; data->offset = 0; + if ((ind->Addr() != nullptr) && ind->Addr()->OperIs(GT_LCL_VAR)) + { + data->lclNum = ind->Addr()->AsLclVar()->GetLclNum(); + } } else { @@ -10502,106 +10574,259 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); + data->storeFlags = ind->gtFlags; return true; } //------------------------------------------------------------------------ -// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar -// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// GetStoreCoalescingData: get the data needed to perform store coalescing for either +// STOREIND/STORE_BLK or local stores. // -// * STOREIND int -// +--* LCL_VAR byref V00 -// \--* CNS_INT int 0x1 +// Arguments: +// store - the store node +// data - [OUT] coalescing data // -// * STOREIND int -// +--* LEA(b+4) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT int 0x2 +// Return Value: +// true if the data was successfully retrieved, false otherwise. 
// -// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const +{ + if (store->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return GetLoadStoreCoalescingData(store->AsIndir(), data); + } + + if (!store->OperIsLocalStore()) + { + return false; + } + + auto* lclStore = store->AsLclVarCommon(); + if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) + { + return false; + } + + bool isClosedRange = false; + LIR::ReadOnlyRange range = BlockRange().GetTreeRange(store, &isClosedRange); + if (!isClosedRange) + { + return false; + } + + LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); + data->targetType = store->TypeGet(); + data->accessSize = store->OperIs(GT_STORE_LCL_FLD) ? lclStore->AsLclFld()->GetSize() : varDsc->lvExactSize(); + data->value = lclStore->Data(); + data->offset = lclStore->GetLclOffs(); + data->lclNum = lclStore->GetLclNum(); + data->isLocalStoreNode = true; + data->isAddressExposed = varDsc->IsAddressExposed(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); + + return true; +} + +//------------------------------------------------------------------------ +// LowerCheckCoalescedStoreAtomicity: decide whether it's safe to coalesce two adjacent +// stores into a single wider store without violating atomicity guarantees. // -// * STOREIND long -// +--* LEA(b+0) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT long 0x200000001 +// Arguments: +// currStore - the current store node +// currData - coalescing data for the current store +// prevData - coalescing data for the previous store // -// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: -// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// Return Value: +// true if it is safe to coalesce the two stores, false otherwise. +// +bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData) +{ + // Now the hardest part: decide whether it's safe to use an unaligned write. + // + // IND<byte> is always fine (and all IND<simd> created here from such) + // IND<simd> is not required to be atomic per our Memory Model + bool allowsNonAtomic = currData.AllowsNonAtomic() && prevData.AllowsNonAtomic(); + int minOffset = min(prevData.offset, currData.offset); + int maxEndOffset = max(prevData.offset + static_cast<int>(prevData.accessSize), + currData.offset + static_cast<int>(currData.accessSize)); + unsigned combinedSize = static_cast<unsigned>(maxEndOffset - minOffset); + + if (!allowsNonAtomic && (m_compiler->info.compRetBuffArg != BAD_VAR_NUM) && + (currData.lclNum == m_compiler->info.compRetBuffArg)) + { + // RetBuf is a private stack memory, so we don't need to worry about atomicity. + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore() && !currData.isAddressExposed && !prevData.isAddressExposed) + { + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore()) + { + // During lowering we cannot rely on the final stack offset yet. Instead, use the stack-home + // alignment guarantees implied by local allocation: locals are naturally aligned up to pointer + // size, and address-exposed locals must be conservative beyond that.
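+ //
+ // Worked example (illustrative): for an address-exposed 8-byte local, merging two INT stores at
+ // offsets 0 and 4 gives minOffset = 0 and combinedSize = 8, which passes on 64-bit targets where
+ // homeAlignment = 8; the same pair at offsets 4 and 8 of a larger local is rejected because
+ // (minOffset % combinedSize) != 0.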
+ unsigned homeAlignment = min(m_compiler->lvaLclStackHomeSize(currData.lclNum), (unsigned)TARGET_POINTER_SIZE); + bool isCombinedStoreAtomic = (combinedSize <= homeAlignment) && ((minOffset % combinedSize) == 0); + +#ifdef TARGET_ARM64 + if (currData.accessSize == TARGET_POINTER_SIZE) + { + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // a 128-bit SIMD write that is 64-bit aligned is treated as a pair of single-copy atomic 64-bit writes. + isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); + } +#endif + + return isCombinedStoreAtomic; + } + + if (!allowsNonAtomic && (genTypeSize(currStore) > 1) && !varTypeIsSIMD(currStore)) + { + // Ignore indices for now, they can invalidate our alignment assumptions. + // Although, we can take scale into account. + if (currData.index != nullptr) + { + return false; + } + + // Base address being TYP_REF gives us a hint that data is pointer-aligned. + if (!currData.baseAddr->TypeIs(TYP_REF)) + { + return false; + } + + // Check whether the combined indir is still aligned. + bool isCombinedIndirAtomic = + (combinedSize <= TARGET_POINTER_SIZE) && ((minOffset % static_cast<int>(combinedSize)) == 0); + + if (combinedSize == 2 * TARGET_POINTER_SIZE) + { +#ifdef TARGET_ARM64 + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory are + // treated as a pair of single-copy atomic 64-bit writes. + // + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null), and assume TYP_LONG/TYP_REF + // are already 64-bit aligned. Otherwise the existing load/store sequence would not have atomicity + // guarantees either. + isCombinedIndirAtomic = true; +#endif + } + else if (combinedSize > TARGET_POINTER_SIZE) + { + return false; + } + + if (!isCombinedIndirAtomic) + { + return false; + } + } + + return true; +} + +//------------------------------------------------------------------------ +// LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD, +// where the previous local store may be either STORE_LCL_VAR or STORE_LCL_FLD. // // Arguments: -// ind - the current STOREIND node +// node - current store node // -void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +void Lowering::LowerStoreCoalescing(GenTree* node) { -// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from -// unaligned accesses making this optimization questionable. + // LA, RISC-V, ARM32, and WASM are more likely to receive a large performance hit from unaligned accesses, + // making this optimization questionable there. + // TODO-WASM-CQ: re-evaluate enabling this once we have better data for Wasm unaligned accesses. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { return; } - if (!ind->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (!node->OperIs(GT_STOREIND, GT_STORE_BLK, GT_STORE_LCL_FLD)) { - // Load coalescing is not yet supported return; } - // We're going to do it in a loop while we see suitable STOREINDs to coalesce. - // E.g.: we have the following LIR sequence: - // - // ...addr nodes... - // STOREIND(int) - // ...addr nodes... - // STOREIND(short) - // ...addr nodes...
- // STOREIND(short) <-- we're here - // - // First we merge two 'short' stores, then we merge the result with the 'int' store - // to get a single store of 8 bytes. do { LoadStoreCoalescingData currData; LoadStoreCoalescingData prevData; - // Get coalescing data for the current STOREIND - if (!GetLoadStoreCoalescingData(ind, &currData)) + if (!GetStoreCoalescingData(node, &currData)) { return; } GenTree* prevTree = currData.rangeStart->gtPrev; - // Now we need to find the previous STOREIND, - // we can ignore any NOPs or IL_OFFSETs in-between while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET)) { prevTree = prevTree->gtPrev; } - // It's not a store - bail out. - if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (prevTree == nullptr) + { + return; + } + + if (node->OperIs(GT_STORE_LCL_FLD)) + { + if (!prevTree->OperIsLocalStore()) + { + return; + } + } + else if (!prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return; + } + + if (!GetStoreCoalescingData(prevTree, &prevData)) { return; } - // Get coalescing data for the previous STOREIND - GenTreeIndir* prevInd = prevTree->AsIndir(); - if (!GetLoadStoreCoalescingData(prevInd, &prevData)) + if (((currData.storeFlags | prevData.storeFlags) & GTF_IND_VOLATILE) != 0) { + // Keep volatile stores in their original form. + // Regular STOREIND nodes may carry GTF_ORDER_SIDEEFF, and the pre-existing + // STOREIND coalescing logic intentionally allowed those cases. return; } - if (!currData.IsAddressEqual(prevData, /*ignoreOffset*/ true)) + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); + int const prevEndOffset = prevData.offset + static_cast<int>(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast<int>(currData.accessSize); + bool const storesOverlap = (currData.offset < prevEndOffset) && (prevData.offset < currEndOffset); + bool const sameLocation = + currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization && storesOverlap); + + if (!sameLocation) { - // Non-offset part of the address is not the same - bail out. return; } - // The same offset means that we're storing to the same location of the same width. - // Just remove the previous store then. - if (prevData.offset == currData.offset) + if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) { + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_GLOB_EFFECT) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_GLOB_EFFECT)) + { + return; + } + + JITDUMP( + "Store coalescing: removing previous store [%06u] because store [%06u] rewrites the same location\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } @@ -10610,17 +10835,14 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) // The algorithm does support TYP_REF (with null value), but it seems to be not worth // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two // items at once. Although, it may be profitable to do "stp q0, q0, [addr]". - if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind)) + if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; } - assert(ind->OperIs(GT_STOREIND)); - assert(prevInd->OperIs(GT_STOREIND)); assert(prevData.IsStore()); assert(currData.IsStore()); - // For now, only non-relocatable constants are supported for data.
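+ // Only constant data participates: two constants can be folded at JIT time into a single wider
+ // immediate, e.g. adjacent INT stores of 0x1 and 0x2 become one LONG store of 0x200000001.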
if (!prevData.value->OperIsConst() || !currData.value->OperIsConst()) { return; } @@ -10636,161 +10858,145 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) return; } - // Otherwise, the difference between two offsets has to match the size of the type. - // We don't support overlapping stores. - if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType)) - { - return; - } - - // For now, we require the current STOREIND to have LEA (previous store may not have it) - // So we can easily adjust the offset, consider making it more flexible in future. - if (!ind->Addr()->OperIs(GT_LEA)) + if (node->OperIs(GT_STOREIND, GT_STORE_BLK) && !node->AsIndir()->Addr()->OperIs(GT_LEA)) { return; } - // Now the hardest part: decide whether it's safe to use an unaligned write. - // - // IND<byte> is always fine (and all IND<simd> created here from such) - // IND<simd> is not required to be atomic per our Memory Model - bool allowsNonAtomic = - ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0); + var_types oldType = node->TypeGet(); + var_types newType = TYP_UNDEF; + bool tryReusingPrevValue = false; + int const minOffset = min(prevData.offset, currData.offset); + int const maxEndOffset = max(prevEndOffset, currEndOffset); + int const combinedSize = maxEndOffset - minOffset; + bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); + bool const currContainsPrev = (currData.offset <= prevData.offset) && (currEndOffset >= prevEndOffset); + bool const storesAreAdjacent = (prevEndOffset == currData.offset) || (currEndOffset == prevData.offset); + bool const storesHaveSameSize = prevData.accessSize == currData.accessSize; - if (!allowsNonAtomic && currData.baseAddr->OperIs(GT_LCL_VAR) && - (currData.baseAddr->AsLclVar()->GetLclNum() == m_compiler->info.compRetBuffArg)) + if (allowOverlapOptimization && currContainsPrev) { - // RetBuf is a private stack memory, so we don't need to worry about atomicity. - allowsNonAtomic = true; + JITDUMP("Store coalescing: removing previous store [%06u] because store [%06u] fully overwrites it\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + continue; } - if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind)) + if (allowOverlapOptimization && prevContainsCurr) { - // TODO-CQ: if we see that the target is a local memory (non address exposed) - // we can use any type (including SIMD) for a new load. - - // Ignore indices for now, they can invalidate our alignment assumptions. - // Although, we can take scale into account. - if (currData.index != nullptr) + if (!varTypeIsIntegral(node)) { return; } - // Base address being TYP_REF gives us a hint that data is pointer-aligned. - if (!currData.baseAddr->TypeIs(TYP_REF)) + switch (combinedSize) { - return; + case 1: + newType = TYP_UBYTE; + break; + case 2: + newType = TYP_USHORT; + break; + case 4: + newType = TYP_INT; + break; +#ifdef TARGET_64BIT + case 8: + newType = TYP_LONG; + break; +#endif + default: + return; } - - // Check whether the combined indir is still aligned.
- bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) && - (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0; - - if (genTypeSize(ind) == TARGET_POINTER_SIZE) + } + else + { + if (!storesAreAdjacent || !storesHaveSameSize) { -#ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) - // - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. - isCombinedIndirAtomic = true; -#endif + return; } - if (!isCombinedIndirAtomic) + if (abs(prevData.offset - currData.offset) != static_cast<int>(prevData.accessSize)) { return; } - } - // Since we're merging two stores of the same type, the new type is twice wider. - var_types oldType = ind->TypeGet(); - var_types newType = TYP_UNDEF; - bool tryReusingPrevValue = false; - switch (oldType) - { - case TYP_BYTE: - case TYP_UBYTE: - newType = TYP_USHORT; - break; + switch (oldType) + { + case TYP_BYTE: + case TYP_UBYTE: + newType = TYP_USHORT; + break; - case TYP_SHORT: - case TYP_USHORT: - newType = TYP_INT; - break; + case TYP_SHORT: + case TYP_USHORT: + newType = TYP_INT; + break; #ifdef TARGET_64BIT - case TYP_INT: - newType = TYP_LONG; - break; + case TYP_INT: + newType = TYP_LONG; + break; #if defined(FEATURE_HW_INTRINSICS) - case TYP_LONG: - case TYP_REF: - // TLDR: we should be here only if one of the conditions is true: - // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag - // 2) ARM64: Data is at least 8-byte aligned - // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+ - // - newType = TYP_SIMD16; - if ((oldType == TYP_REF) && - (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) - { - // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: - // - // arr[1] = "hello"; - // arr[0] = "world"; - // - // but we don't want to load managed references into SIMD registers (we can only do so - // when we can issue a nongc region for a block) - return; - } - break; + case TYP_LONG: + case TYP_REF: + // TLDR: we should be here only if one of the conditions is true: + // 1) Both stores have the GTF_IND_ALLOW_NON_ATOMIC flag + // 2) ARM64: data is at least 8-byte aligned + // 3) AMD64: data is at least 16-byte aligned on AMD/Intel with AVX+ + newType = TYP_SIMD16; + if ((oldType == TYP_REF) && + (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) + { + // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: + // + // arr[1] = "hello"; + // arr[0] = "world"; + // + // but we don't want to load managed references into SIMD registers (we can only do so + // when we can issue a nongc region for a block). + return; + } + break; #if defined(TARGET_AMD64) - case TYP_SIMD16: - if (m_compiler->getPreferredVectorByteLength() >= 32) - { - newType = TYP_SIMD32; + case TYP_SIMD16: + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16.
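+ // E.g. two adjacent SIMD16 constant stores can merge into one SIMD32 store when the host
+ // prefers 32-byte vectors; otherwise we only cache the previous value in a local
+ // (tryReusingPrevValue below), since LSRA does not always do that for SIMD constants.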
+ if (m_compiler->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; - case TYP_SIMD32: - if (m_compiler->getPreferredVectorByteLength() >= 64) - { - newType = TYP_SIMD64; + case TYP_SIMD32: + if (m_compiler->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; #elif defined(TARGET_ARM64) // TARGET_AMD64 - case TYP_SIMD16: - tryReusingPrevValue = true; - break; - -#endif // TARGET_ARM64 -#endif // FEATURE_HW_INTRINSICS -#endif // TARGET_64BIT + case TYP_SIMD16: + tryReusingPrevValue = true; + break; +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT + default: + return; + } + } - // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to - // be converted to TYP_INT/TYP_LONG for constant value. - // - // TYP_UINT and TYP_ULONG are not legal for GT_IND. - // - default: - return; + if (!LowerCheckCoalescedStoreAtomicity(node, currData, prevData)) + { + return; } - // If we can't merge these two stores into a single store, we can at least - // cache prevData.value to a local and reuse it in currData. - // Normally, LSRA is expected to do this for us, but it's not always the case for SIMD. if (tryReusingPrevValue) { #if defined(FEATURE_HW_INTRINSICS) @@ -10802,7 +11008,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) m_compiler->gtNewLclvNode(use.ReplaceWithLclVar(m_compiler), prevData.value->TypeGet()); BlockRange().InsertBefore(currData.value, prevValueTmp); BlockRange().Remove(currData.value); - ind->Data() = prevValueTmp; + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = prevValueTmp; + } + else + { + node->AsIndir()->Data() = prevValueTmp; + } } #endif // FEATURE_HW_INTRINSICS return; @@ -10810,35 +11024,50 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) assert(newType != TYP_UNDEF); - // We should not be here for stores requiring write barriers. - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind->AsStoreInd())); - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd->AsStoreInd())); + if (node->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + auto* ind = node->AsStoreInd(); + auto* prevInd = prevTree->AsStoreInd(); - // Delete previous STOREIND entirely - BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd)); - // It's not expected to be contained yet, but just in case... 
- ind->Data()->ClearContained(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // We know it's always LEA for now - GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + ind->Data()->ClearContained(); + GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + addr->SetOffset(min(prevData.offset, currData.offset)); - // Update offset to be the minimum of the two - addr->SetOffset(min(prevData.offset, currData.offset)); + ind->gtType = newType; + ind->Data()->gtType = newType; + } + else + { + auto* lclStore = node->AsLclVarCommon(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // Update type for both STOREIND and val - ind->gtType = newType; - ind->Data()->gtType = newType; + lclStore->Data()->ClearContained(); + lclStore->gtType = newType; + lclStore->Data()->gtType = newType; + if (lclStore->OperIs(GT_STORE_LCL_FLD)) + { + lclStore->AsLclFld()->SetLclOffs(min(prevData.offset, currData.offset)); + } + } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) // Upgrading two SIMD stores to a wider SIMD store. - // Only on x64 since ARM64 has no options above SIMD16 + // Only on x64 since ARM64 has no options above SIMD16. if (varTypeIsSIMD(oldType)) { + if (!prevData.value->OperIs(GT_CNS_VEC) || !currData.value->OperIs(GT_CNS_VEC)) + { + return; + } + int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; - // if the previous store was at a higher address, swap the constants if (prevData.offset > currData.offset) { std::swap(lowerCns, upperCns); @@ -10849,58 +11078,109 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) memcpy(newCns.i8, lowerCns, oldWidth); memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); - ind->Data()->AsVecCon()->gtSimdVal = newCns; + currData.value->AsVecCon()->gtSimdVal = newCns; continue; } -#endif +#endif // TARGET_AMD64 && FEATURE_HW_INTRINSICS - size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue(); - size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue(); - - // if the previous store was at a higher address, swap the constants - if (prevData.offset > currData.offset) + // The integer path below places each constant according to its byte offset, so it doesn't need to swap the + // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. + uint64_t lowerCns = 0; + uint64_t upperCns = 0; + if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || + !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) { - std::swap(lowerCns, upperCns); + return; } #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) - // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16 - // All legality checks were done above. if (varTypeIsSIMD(newType)) { - // Replace two 64bit constants with a single 128bit constant + if (prevData.offset > currData.offset) + { + std::swap(lowerCns, upperCns); + } + int8_t val[16]; memcpy(val, &lowerCns, 8); memcpy(val + 8, &upperCns, 8); GenTreeVecCon* vecCns = m_compiler->gtNewVconNode(newType, &val); - BlockRange().InsertAfter(ind->Data(), vecCns); - BlockRange().Remove(ind->Data()); - ind->gtOp2 = vecCns; + BlockRange().InsertAfter(currData.value, vecCns); + BlockRange().Remove(currData.value); + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = vecCns; + } + else + { + node->AsIndir()->Data() = vecCns; + } + continue; } #endif // TARGET_64BIT && FEATURE_HW_INTRINSICS - // Trim the constants to the size of the type, e.g. 
for TYP_SHORT and TYP_USHORT - // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF. - size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_PER_BYTE; - lowerCns &= mask; - upperCns &= mask; + uint64_t prevMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - prevData.accessSize) * BITS_PER_BYTE; + uint64_t currMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - currData.accessSize) * BITS_PER_BYTE; + uint64_t newMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - genTypeSize(newType)) * BITS_PER_BYTE; + lowerCns &= prevMask; + upperCns &= currMask; - size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_PER_BYTE))); + unsigned const prevShift = static_cast<unsigned>((prevData.offset - minOffset) * BITS_PER_BYTE); + unsigned const currShift = static_cast<unsigned>((currData.offset - minOffset) * BITS_PER_BYTE); + + uint64_t prevBits = (lowerCns << prevShift) & newMask; + uint64_t currBits = (upperCns << currShift) & newMask; + + // Later stores must overwrite any overlapping bytes from earlier stores. + uint64_t currBitsMask = (currMask << currShift) & newMask; + uint64_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); - ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val; - if (genTypeSize(oldType) == 1) + assert(currData.value->OperIs(GT_CNS_INT)); + auto* intCon = currData.value->AsIntCon(); + intCon->gtIconVal = static_cast<ssize_t>(val); + if (genTypeSize(oldType) == 1 && node->OperIsIndir()) { - // A mark for future foldings that this IND doesn't need to be atomic. - ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; + node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } - } while (true); #endif // TARGET_XARCH || TARGET_ARM64 } +//------------------------------------------------------------------------ +// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar +// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// +// * STOREIND int +// +--* LCL_VAR byref V00 +// \--* CNS_INT int 0x1 +// +// * STOREIND int +// +--* LEA(b+4) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT int 0x2 +// +// We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +// +// * STOREIND long +// +--* LEA(b+0) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT long 0x200000001 +// +// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: +// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// +// Arguments: +// ind - the current STOREIND node +// +void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +{ + LowerStoreCoalescing(ind); +} + //------------------------------------------------------------------------ // LowerStoreIndirCommon: a common logic to lower StoreIndir.
// diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index d88ab35f9c2ad0..a3e7b8f8356210 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -351,32 +351,62 @@ class Lowering final : public Phase struct LoadStoreCoalescingData { - var_types targetType; - GenTree* baseAddr; - GenTree* index; - GenTree* value; - uint32_t scale; - int offset; - GenTree* rangeStart; - GenTree* rangeEnd; + var_types targetType = TYP_UNDEF; + GenTree* baseAddr = nullptr; + GenTree* index = nullptr; + GenTree* value = nullptr; + uint32_t scale = 1; + int offset = 0; + unsigned accessSize = 0; + unsigned lclNum = BAD_VAR_NUM; + GenTreeFlags storeFlags = GTF_EMPTY; + bool isLocalStoreNode = false; + bool isAddressExposed = false; + GenTree* rangeStart = nullptr; + GenTree* rangeEnd = nullptr; bool IsStore() const { return value != nullptr; } - bool IsAddressEqual(const LoadStoreCoalescingData& other, bool ignoreOffset) const + bool IsLocalStore() const { - if ((scale != other.scale) || (targetType != other.targetType) || - !GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + return isLocalStoreNode; + } + + bool AllowsNonAtomic() const + { + return (storeFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0; + } + + bool IsAddressEqual(const LoadStoreCoalescingData& other, + bool ignoreOffset, + bool allowDifferentTypes = false) const + { + if ((scale != other.scale) || (!allowDifferentTypes && (targetType != other.targetType))) { return false; } + + if (IsLocalStore() || other.IsLocalStore()) + { + if (!IsLocalStore() || !other.IsLocalStore() || (lclNum != other.lclNum)) + { + return false; + } + } + else if (!GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + { + return false; + } + return ignoreOffset || (offset == other.offset); } }; bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const; + bool GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const; // Per tree node member functions GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind); @@ -389,7 +419,11 @@ class Lowering final : public Phase void MarkTree(GenTree* root); void UnmarkTree(GenTree* root); GenTree* LowerStoreIndir(GenTreeStoreInd* node); + void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); + bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index b9fa68710da733..e44c3176c89c0e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -2,20 +2,30 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// -using System.Runtime.CompilerServices; -using System; +using System; +using System.Runtime.CompilerServices; using Xunit; public class SpAddr { - - // Struct in reg (2 ints) struct S { public int i0; public int i1; } + struct Pair + { + public int A; + public int B; + } + + struct FloatPair + { + public float A; + public float B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -35,12 +45,74 @@ static int M(int i0, int i1, int i2, int i3) return Foo(s0, s1); // r0 <= r1; r1 <= r0; r2 <= r3; r3 <= r2 } + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(FloatPair p) => + ((long)(uint)BitConverter.SingleToInt32Bits(p.B) << 32) | (uint)BitConverter.SingleToInt32Bits(p.A); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Expose(ref Pair p) + { + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long NonAddressExposed(int a, int b) + { + Pair p; + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AddressExposed(int a, int b) + { + Pair p = default; + Expose(ref p); + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static int OverlappingNonZeroThenShort() + { + Pair p = default; + p.A = unchecked((int)0xDEADBEEF); + Unsafe.As<Pair, short>(ref p) = 0; + return p.A; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AdjacentFloatNonZeroThenZero() + { + FloatPair p = default; + p.A = 1.0f; + p.B = 0.0f; + return Consume(p); + } + [Fact] public static int TestEntryPoint() { int res = M(1, 2, 3, 4); Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); - if (res == 10) + long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); + long addressExposed = AddressExposed(0x10203040, 0x50607080); + int overlapping = OverlappingNonZeroThenShort(); + long floatPair = AdjacentFloatNonZeroThenZero(); + + int expectedOverlapping = BitConverter.IsLittleEndian + ? unchecked((int)0xDEAD0000) + : unchecked((int)0x0000BEEF); + + if ((res == 10) && + (nonAddressExposed == 0x5566778811223344L) && + (addressExposed == 0x5060708010203040L) && + (overlapping == expectedOverlapping) && + (floatPair == 0x000000003F800000L)) return 100; else return 99;
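For context, a minimal C# sketch (hypothetical names, not part of this test file) of the pattern the new STORE_LCL_FLD coalescing targets: adjacent constant stores to fields of a stack struct local, which lowering may now merge into a single wider store.

    using System.Runtime.CompilerServices;

    public static class StoreCoalescingSketch
    {
        struct Pair { public int A; public int B; }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public static long Make()
        {
            Pair p;
            p.A = 0x1; // 4-byte constant store at offset 0
            p.B = 0x2; // adjacent 4-byte constant store at offset 4
            // With JitEnableStoreLclFldCoalescing=1, lowering may emit one 8-byte store of 0x200000001.
            return ((long)p.B << 32) | (uint)p.A;
        }
    }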