diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 44cbd6d0e5eb71..e747bb8223c03d 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -91,6 +91,7 @@ CONFIG_INTEGER(JitHideAlignBehindJmp, "JitHideAlignBehindJmp", 1) // Track stores to locals done through return buffers. CONFIG_INTEGER(JitOptimizeStructHiddenBuffer, "JitOptimizeStructHiddenBuffer", 1) +RELEASE_CONFIG_INTEGER(JitEnableStoreLclFldCoalescing, "JitEnableStoreLclFldCoalescing", 1) CONFIG_INTEGER(JitUnrollLoopMaxIterationCount, "JitUnrollLoopMaxIterationCount", diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 4d30b7d109967a..5d783052b2fe8b 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -5668,6 +5668,11 @@ GenTree* Lowering::LowerStoreLocCommon(GenTreeLclVarCommon* lclStore) TryRetypingFloatingPointStoreToIntegerStore(lclStore); + if (lclStore->OperIs(GT_STORE_LCL_FLD) && (JitConfig.JitEnableStoreLclFldCoalescing() != 0)) + { + LowerStoreCoalescing(lclStore); + } + GenTree* src = lclStore->gtGetOp1(); LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); const bool srcIsMultiReg = src->IsMultiRegNode(); @@ -10405,6 +10410,77 @@ void Lowering::LowerBlockStoreAsHelperCall(GenTreeBlk* blkNode) } } +//------------------------------------------------------------------------ +// IsStoreCoalescingInvariantNode: check whether a node is simple enough to participate in +// lowering-time store coalescing. +// +// Arguments: +// compiler - the compiler instance +// node - the node to examine +// allowNull - true if a null node should be considered valid +// +// Return Value: +// true if the node is an allowed invariant input for store coalescing; otherwise false. +// +static bool IsStoreCoalescingInvariantNode(Compiler* compiler, GenTree* node, bool allowNull = false) +{ + if (node == nullptr) + { + return allowNull; + } + + if (node->OperIsConst()) + { + return true; + } + + // We can allow bigger trees here, but it's not clear if it's worth it. + return node->OperIs(GT_LCL_VAR) && !compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); +} + +//------------------------------------------------------------------------ +// TryGetStoreCoalescingConstantBits: get the raw bits for a constant used by store +// coalescing. +// +// Arguments: +// value - the constant node +// bits - [out] the raw constant bits +// +// Return Value: +// true if the constant can participate in store coalescing; otherwise false. +// +static bool TryGetStoreCoalescingConstantBits(GenTree* value, uint64_t* bits) +{ + assert(bits != nullptr); + *bits = 0; + + if (value->IsCnsIntOrI()) + { + *bits = static_cast<uint64_t>(value->AsIntCon()->IconValue()); + return true; + } + + if (value->IsCnsFltOrDbl()) + { + if (value->TypeIs(TYP_FLOAT)) + { + float floatCns = static_cast<float>(value->AsDblCon()->DconValue()); + *bits = BitOperations::SingleToUInt32Bits(floatCns); + return true; + } + +#ifdef TARGET_64BIT + // We only need the raw 64-bit payload for targets where the resulting 8-byte coalesced store is supported. + assert(value->TypeIs(TYP_DOUBLE)); + double doubleCns = value->AsDblCon()->DconValue(); + *bits = BitOperations::DoubleToUInt64Bits(doubleCns); + return true; +#endif + } + + return false; +} + //------------------------------------------------------------------------ // GetLoadStoreCoalescingData: given a STOREIND/IND node, get the data needed to perform // store/load coalescing including pointer to the previous node.
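Aside (reviewer illustration, not part of the patch): TryGetStoreCoalescingConstantBits reduces every supported constant to a raw integer payload so that two payloads can later be packed into one wider immediate, e.g. 1.0f becomes 0x3F800000. A minimal standalone sketch of the same float-path reinterpretation, assuming only the C++ standard library:

    #include <cstdint>
    #include <cstring>

    static uint32_t SingleToUInt32Bits(float value)
    {
        uint32_t bits;
        std::memcpy(&bits, &value, sizeof(bits)); // bit-for-bit copy, no numeric conversion
        return bits;                              // e.g. 1.0f -> 0x3F800000
    }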
@@ -10428,23 +10504,10 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing const bool isStore = ind->OperIs(GT_STOREIND, GT_STORE_BLK); const bool isLoad = ind->OperIs(GT_IND); - auto isNodeInvariant = [](Compiler* m_compiler, GenTree* node, bool allowNull) { - if (node == nullptr) - { - return allowNull; - } - if (node->OperIsConst()) - { - return true; - } - // We can allow bigger trees here, but it's not clear if it's worth it. - return node->OperIs(GT_LCL_VAR) && !m_compiler->lvaVarAddrExposed(node->AsLclVar()->GetLclNum()); - }; - if (isStore) { // For stores, Data() is expected to be an invariant node - if (!isNodeInvariant(m_compiler, ind->Data(), false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, ind->Data())) { return false; } @@ -10455,19 +10518,20 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing } data->targetType = ind->TypeGet(); + data->accessSize = ind->Size(); data->value = isStore ? ind->Data() : nullptr; if (ind->Addr()->OperIs(GT_LEA)) { GenTree* base = ind->Addr()->AsAddrMode()->Base(); GenTree* index = ind->Addr()->AsAddrMode()->Index(); - if (!isNodeInvariant(m_compiler, base, false)) + if (!IsStoreCoalescingInvariantNode(m_compiler, base)) { // Base must be a local. It's possible for it to be nullptr when index is not null, // but let's ignore such cases. return false; } - if (!isNodeInvariant(m_compiler, index, true)) + if (!IsStoreCoalescingInvariantNode(m_compiler, index, true)) { // Index should be either nullptr or a local. return false; @@ -10477,14 +10541,22 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->index = index == nullptr ? nullptr : index; data->scale = ind->Addr()->AsAddrMode()->GetScale(); data->offset = ind->Addr()->AsAddrMode()->Offset(); + if ((base != nullptr) && base->OperIs(GT_LCL_VAR)) + { + data->lclNum = base->AsLclVar()->GetLclNum(); + } } - else if (isNodeInvariant(m_compiler, ind->Addr(), true)) + else if (IsStoreCoalescingInvariantNode(m_compiler, ind->Addr(), true)) { // Address is just a local, no offset, scale is 1 data->baseAddr = ind->Addr(); data->index = nullptr; data->scale = 1; data->offset = 0; + if ((ind->Addr() != nullptr) && ind->Addr()->OperIs(GT_LCL_VAR)) + { + data->lclNum = ind->Addr()->AsLclVar()->GetLclNum(); + } } else { @@ -10502,106 +10574,259 @@ bool Lowering::GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescing data->rangeStart = range.FirstNode(); data->rangeEnd = range.LastNode(); + data->storeFlags = ind->gtFlags; return true; } //------------------------------------------------------------------------ -// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar -// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// GetStoreCoalescingData: get the data needed to perform store coalescing for either +// STOREIND/STORE_BLK or local stores. // -// * STOREIND int -// +--* LCL_VAR byref V00 -// \--* CNS_INT int 0x1 +// Arguments: +// store - the store node +// data - [OUT] coalescing data // -// * STOREIND int -// +--* LEA(b+4) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT int 0x2 +// Return Value: +// true if the data was successfully retrieved, false otherwise. 
// -// We can merge these two into into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +bool Lowering::GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const +{ + if (store->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return GetLoadStoreCoalescingData(store->AsIndir(), data); + } + + if (!store->OperIsLocalStore()) + { + return false; + } + + auto* lclStore = store->AsLclVarCommon(); + if (!IsStoreCoalescingInvariantNode(m_compiler, lclStore->Data())) + { + return false; + } + + bool isClosedRange = false; + LIR::ReadOnlyRange range = BlockRange().GetTreeRange(store, &isClosedRange); + if (!isClosedRange) + { + return false; + } + + LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclStore); + data->targetType = store->TypeGet(); + data->accessSize = store->OperIs(GT_STORE_LCL_FLD) ? lclStore->AsLclFld()->GetSize() : varDsc->lvExactSize(); + data->value = lclStore->Data(); + data->offset = lclStore->GetLclOffs(); + data->lclNum = lclStore->GetLclNum(); + data->isLocalStoreNode = true; + data->isAddressExposed = varDsc->IsAddressExposed(); + data->storeFlags = GTF_EMPTY; + data->rangeStart = range.FirstNode(); + data->rangeEnd = range.LastNode(); + + return true; +} + +//------------------------------------------------------------------------ +// LowerCheckCoalescedStoreAtomicity: decide whether it's safe to coalesce two adjacent +// stores into a single wider store without violating atomicity guarantees. // -// * STOREIND long -// +--* LEA(b+0) byref -// | \--* LCL_VAR byref V00 -// \--* CNS_INT long 0x200000001 +// Arguments: +// currStore - the current store node +// currData - coalescing data for the current store +// prevData - coalescing data for the previous store // -// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: -// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// Return Value: +// true if it is safe to coalesce the two stores, false otherwise. +// +bool Lowering::LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData) +{ + // Now the hardest part: decide whether it's safe to use an unaligned write. + // + // IND<byte> is always fine (and all IND<simd> created here from such) + // IND<simd> is not required to be atomic per our Memory Model + bool allowsNonAtomic = currData.AllowsNonAtomic() && prevData.AllowsNonAtomic(); + int minOffset = min(prevData.offset, currData.offset); + int maxEndOffset = max(prevData.offset + static_cast<int>(prevData.accessSize), + currData.offset + static_cast<int>(currData.accessSize)); + unsigned combinedSize = static_cast<unsigned>(maxEndOffset - minOffset); + + if (!allowsNonAtomic && (m_compiler->info.compRetBuffArg != BAD_VAR_NUM) && + (currData.lclNum == m_compiler->info.compRetBuffArg)) + { + // RetBuf is a private stack memory, so we don't need to worry about atomicity. + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore() && !currData.isAddressExposed && !prevData.isAddressExposed) + { + allowsNonAtomic = true; + } + + if (!allowsNonAtomic && currData.IsLocalStore()) + { + // During lowering we cannot rely on the final stack offset yet. Instead, use the stack-home + // alignment guarantees implied by local allocation: locals are naturally aligned up to pointer + // size, and address-exposed locals must be conservative beyond that.
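+ //
+ // Worked example (illustrative): for an address-exposed 8-byte local, merging two INT stores at
+ // offsets 0 and 4 gives minOffset = 0 and combinedSize = 8, which passes on 64-bit targets where
+ // homeAlignment = 8; the same pair at offsets 4 and 8 of a larger local is rejected because
+ // (minOffset % combinedSize) != 0.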
+ unsigned homeAlignment = min(m_compiler->lvaLclStackHomeSize(currData.lclNum), (unsigned)TARGET_POINTER_SIZE); + bool isCombinedStoreAtomic = (combinedSize <= homeAlignment) && ((minOffset % combinedSize) == 0); + +#ifdef TARGET_ARM64 + if (currData.accessSize == TARGET_POINTER_SIZE) + { + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // a 128-bit SIMD write that is 64-bit aligned is treated as a pair of single-copy atomic 64-bit writes. + isCombinedStoreAtomic = (homeAlignment >= TARGET_POINTER_SIZE) && ((minOffset % TARGET_POINTER_SIZE) == 0); + } +#endif + + return isCombinedStoreAtomic; + } + + if (!allowsNonAtomic && (genTypeSize(currStore) > 1) && !varTypeIsSIMD(currStore)) + { + // Ignore indices for now, they can invalidate our alignment assumptions. + // Although, we can take scale into account. + if (currData.index != nullptr) + { + return false; + } + + // Base address being TYP_REF gives us a hint that data is pointer-aligned. + if (!currData.baseAddr->TypeIs(TYP_REF)) + { + return false; + } + + // Check whether the combined indir is still aligned. + bool isCombinedIndirAtomic = + (combinedSize <= TARGET_POINTER_SIZE) && ((minOffset % static_cast<int>(combinedSize)) == 0); + + if (combinedSize == 2 * TARGET_POINTER_SIZE) + { +#ifdef TARGET_ARM64 + // See the Arm Architecture Reference Manual for A-profile architecture, "Single-copy atomicity": + // writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory are + // treated as a pair of single-copy atomic 64-bit writes. + // + // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null), and assume TYP_LONG/TYP_REF + // are already 64-bit aligned. Otherwise the existing load/store sequence would not have atomicity + // guarantees either. + isCombinedIndirAtomic = true; +#endif + } + else if (combinedSize > TARGET_POINTER_SIZE) + { + return false; + } + + if (!isCombinedIndirAtomic) + { + return false; + } + } + + return true; +} + +//------------------------------------------------------------------------ +// LowerStoreCoalescing: if the given store is followed by a similar adjacent constant store, +// try to merge them into a single wider store. Supports STOREIND/STORE_BLK and STORE_LCL_FLD, +// where the previous local store may be either STORE_LCL_VAR or STORE_LCL_FLD. // // Arguments: -// ind - the current STOREIND node +// node - current store node // -void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +void Lowering::LowerStoreCoalescing(GenTree* node) { -// LA, RISC-V and ARM32 more likely to receive a terrible performance hit from -// unaligned accesses making this optimization questionable. + // LA, RISC-V, ARM32, and WASM are more likely to receive a large performance hit from unaligned accesses, + // making this optimization questionable there. + // TODO-WASM-CQ: re-evaluate enabling this once we have better data for Wasm unaligned accesses. #if defined(TARGET_XARCH) || defined(TARGET_ARM64) if (!m_compiler->opts.OptimizationEnabled()) { return; } - if (!ind->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (!node->OperIs(GT_STOREIND, GT_STORE_BLK, GT_STORE_LCL_FLD)) { - // Load coalescing is not yet supported return; } - // We're going to do it in a loop while we see suitable STOREINDs to coalesce. - // E.g.: we have the following LIR sequence: - // - // ...addr nodes... - // STOREIND(int) - // ...addr nodes... - // STOREIND(short) - // ...addr nodes...
- // STOREIND(short) <-- we're here - // - // First we merge two 'short' stores, then we merge the result with the 'int' store - // to get a single store of 8 bytes. do { LoadStoreCoalescingData currData; LoadStoreCoalescingData prevData; - // Get coalescing data for the current STOREIND - if (!GetLoadStoreCoalescingData(ind, &currData)) + if (!GetStoreCoalescingData(node, &currData)) { return; } GenTree* prevTree = currData.rangeStart->gtPrev; - // Now we need to find the previous STOREIND, - // we can ignore any NOPs or IL_OFFSETs in-between while ((prevTree != nullptr) && prevTree->OperIs(GT_NOP, GT_IL_OFFSET)) { prevTree = prevTree->gtPrev; } - // It's not a store - bail out. - if ((prevTree == nullptr) || !prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + if (prevTree == nullptr) + { + return; + } + + if (node->OperIs(GT_STORE_LCL_FLD)) + { + if (!prevTree->OperIsLocalStore()) + { + return; + } + } + else if (!prevTree->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + return; + } + + if (!GetStoreCoalescingData(prevTree, &prevData)) { return; } - // Get coalescing data for the previous STOREIND - GenTreeIndir* prevInd = prevTree->AsIndir(); - if (!GetLoadStoreCoalescingData(prevInd, &prevData)) + if (((currData.storeFlags | prevData.storeFlags) & GTF_IND_VOLATILE) != 0) { + // Keep volatile stores in their original form. + // Regular STOREIND nodes may carry GTF_ORDER_SIDEEFF, and the pre-existing + // STOREIND coalescing logic intentionally allowed those cases. return; } - if (!currData.IsAddressEqual(prevData, /*ignoreOffset*/ true)) + bool const allowOverlapOptimization = node->OperIs(GT_STORE_LCL_FLD); + int const prevEndOffset = prevData.offset + static_cast<int>(prevData.accessSize); + int const currEndOffset = currData.offset + static_cast<int>(currData.accessSize); + bool const storesOverlap = (currData.offset < prevEndOffset) && (prevData.offset < currEndOffset); + bool const sameLocation = + currData.IsAddressEqual(prevData, /*ignoreOffset*/ true, allowOverlapOptimization && storesOverlap); + + if (!sameLocation) { - // Non-offset part of the address is not the same - bail out. return; } - // The same offset means that we're storing to the same location of the same width. - // Just remove the previous store then. - if (prevData.offset == currData.offset) + if ((currData.offset == prevData.offset) && (currData.targetType == prevData.targetType)) { + if (m_compiler->gtTreeHasSideEffects(prevData.value, GTF_GLOB_EFFECT) || + m_compiler->gtTreeHasSideEffects(currData.value, GTF_GLOB_EFFECT)) + { + return; + } + + JITDUMP( + "Store coalescing: removing previous store [%06u] because store [%06u] rewrites the same location\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); continue; } @@ -10610,17 +10835,14 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) // The algorithm does support TYP_REF (with null value), but it seems to be not worth // it on ARM64 where it's pretty efficient to do "stp xzr, xzr, [addr]" to clear two // items at once. Although, it may be profitable to do "stp q0, q0, [addr]". - if (!varTypeIsIntegral(ind) && !varTypeIsSIMD(ind)) + if (!varTypeIsIntegral(node) && !varTypeIsSIMD(node)) { return; } - assert(ind->OperIs(GT_STOREIND)); - assert(prevInd->OperIs(GT_STOREIND)); assert(prevData.IsStore()); assert(currData.IsStore()); - // For now, only non-relocatable constants are supported for data.
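+ // Only constant data participates: two constants can be folded at JIT time into a single wider
+ // immediate, e.g. adjacent INT stores of 0x1 and 0x2 become one LONG store of 0x200000001.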
if (!prevData.value->OperIsConst() || !currData.value->OperIsConst()) { return; } @@ -10636,161 +10858,145 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) return; } - // Otherwise, the difference between two offsets has to match the size of the type. - // We don't support overlapping stores. - if (abs(prevData.offset - currData.offset) != (int)genTypeSize(prevData.targetType)) - { - return; - } - - // For now, we require the current STOREIND to have LEA (previous store may not have it) - // So we can easily adjust the offset, consider making it more flexible in future. - if (!ind->Addr()->OperIs(GT_LEA)) + if (node->OperIs(GT_STOREIND, GT_STORE_BLK) && !node->AsIndir()->Addr()->OperIs(GT_LEA)) { return; } - // Now the hardest part: decide whether it's safe to use an unaligned write. - // - // IND<byte> is always fine (and all IND<simd> created here from such) - // IND<simd> is not required to be atomic per our Memory Model - bool allowsNonAtomic = - ((ind->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0) && ((prevInd->gtFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0); + var_types oldType = node->TypeGet(); + var_types newType = TYP_UNDEF; + bool tryReusingPrevValue = false; + int const minOffset = min(prevData.offset, currData.offset); + int const maxEndOffset = max(prevEndOffset, currEndOffset); + int const combinedSize = maxEndOffset - minOffset; + bool const prevContainsCurr = (prevData.offset <= currData.offset) && (prevEndOffset >= currEndOffset); + bool const currContainsPrev = (currData.offset <= prevData.offset) && (currEndOffset >= prevEndOffset); + bool const storesAreAdjacent = (prevEndOffset == currData.offset) || (currEndOffset == prevData.offset); + bool const storesHaveSameSize = prevData.accessSize == currData.accessSize; - if (!allowsNonAtomic && currData.baseAddr->OperIs(GT_LCL_VAR) && - (currData.baseAddr->AsLclVar()->GetLclNum() == m_compiler->info.compRetBuffArg)) + if (allowOverlapOptimization && currContainsPrev) { - // RetBuf is a private stack memory, so we don't need to worry about atomicity. - allowsNonAtomic = true; + JITDUMP("Store coalescing: removing previous store [%06u] because store [%06u] fully overwrites it\n", + m_compiler->dspTreeID(prevTree), m_compiler->dspTreeID(node)); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + continue; } - if (!allowsNonAtomic && (genTypeSize(ind) > 1) && !varTypeIsSIMD(ind)) + if (allowOverlapOptimization && prevContainsCurr) { - // TODO-CQ: if we see that the target is a local memory (non address exposed) - // we can use any type (including SIMD) for a new load. - - // Ignore indices for now, they can invalidate our alignment assumptions. - // Although, we can take scale into account. - if (currData.index != nullptr) + if (!varTypeIsIntegral(node)) { return; } - // Base address being TYP_REF gives us a hint that data is pointer-aligned. - if (!currData.baseAddr->TypeIs(TYP_REF)) + switch (combinedSize) { - return; + case 1: + newType = TYP_UBYTE; + break; + case 2: + newType = TYP_USHORT; + break; + case 4: + newType = TYP_INT; + break; +#ifdef TARGET_64BIT + case 8: + newType = TYP_LONG; + break; +#endif + default: + return; } - - // Check whether the combined indir is still aligned.
- bool isCombinedIndirAtomic = (genTypeSize(ind) < TARGET_POINTER_SIZE) && - (min(prevData.offset, currData.offset) % (genTypeSize(ind) * 2)) == 0; - - if (genTypeSize(ind) == TARGET_POINTER_SIZE) + } + else + { + if (!storesAreAdjacent || !storesHaveSameSize) { -#ifdef TARGET_ARM64 - // Per Arm Architecture Reference Manual for A-profile architecture: - // - // * Writes from SIMD and floating-point registers of a 128-bit value that is 64-bit aligned in memory - // are treated as a pair of single - copy atomic 64 - bit writes. - // - // Thus, we can allow 2xLONG -> SIMD, same for TYP_REF (for value being null) - // - // And we assume on ARM64 TYP_LONG/TYP_REF are always 64-bit aligned, otherwise - // we're already doing a load that has no atomicity guarantees. - isCombinedIndirAtomic = true; -#endif + return; } - if (!isCombinedIndirAtomic) + if (abs(prevData.offset - currData.offset) != static_cast<int>(prevData.accessSize)) { return; } - } - // Since we're merging two stores of the same type, the new type is twice wider. - var_types oldType = ind->TypeGet(); - var_types newType = TYP_UNDEF; - bool tryReusingPrevValue = false; - switch (oldType) - { - case TYP_BYTE: - case TYP_UBYTE: - newType = TYP_USHORT; - break; + switch (oldType) + { + case TYP_BYTE: + case TYP_UBYTE: + newType = TYP_USHORT; + break; - case TYP_SHORT: - case TYP_USHORT: - newType = TYP_INT; - break; + case TYP_SHORT: + case TYP_USHORT: + newType = TYP_INT; + break; #ifdef TARGET_64BIT - case TYP_INT: - newType = TYP_LONG; - break; + case TYP_INT: + newType = TYP_LONG; + break; #if defined(FEATURE_HW_INTRINSICS) - case TYP_LONG: - case TYP_REF: - // TLDR: we should be here only if one of the conditions is true: - // 1) Both GT_INDs have GTF_IND_ALLOW_NON_ATOMIC flag - // 2) ARM64: Data is at least 8-byte aligned - // 3) AMD64: Data is at least 16-byte aligned on AMD/Intel with AVX+ - // - newType = TYP_SIMD16; - if ((oldType == TYP_REF) && - (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) - { - // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: - // - // arr[1] = "hello"; - // arr[0] = "world"; - // - // but we don't want to load managed references into SIMD registers (we can only do so - // when we can issue a nongc region for a block) - return; - } - break; + case TYP_LONG: + case TYP_REF: + // TLDR: we should be here only if one of the conditions is true: + // 1) Both stores have the GTF_IND_ALLOW_NON_ATOMIC flag + // 2) ARM64: data is at least 8-byte aligned + // 3) AMD64: data is at least 16-byte aligned on AMD/Intel with AVX+ + newType = TYP_SIMD16; + if ((oldType == TYP_REF) && + (!currData.value->IsIntegralConst(0) || !prevData.value->IsIntegralConst(0))) + { + // For TYP_REF we only support null values. In theory, we can also support frozen handles, e.g.: + // + // arr[1] = "hello"; + // arr[0] = "world"; + // + // but we don't want to load managed references into SIMD registers (we can only do so + // when we can issue a nongc region for a block). + return; + } + break; #if defined(TARGET_AMD64) - case TYP_SIMD16: - if (m_compiler->getPreferredVectorByteLength() >= 32) - { - newType = TYP_SIMD32; + case TYP_SIMD16: + // Upgrading two SIMD stores to a wider SIMD store. + // Only on x64 since ARM64 has no options above SIMD16.
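+ // E.g. two adjacent SIMD16 constant stores can merge into one SIMD32 store when the host
+ // prefers 32-byte vectors; otherwise we only cache the previous value in a local
+ // (tryReusingPrevValue below), since LSRA does not always do that for SIMD constants.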
+ if (m_compiler->getPreferredVectorByteLength() >= 32) + { + newType = TYP_SIMD32; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; - case TYP_SIMD32: - if (m_compiler->getPreferredVectorByteLength() >= 64) - { - newType = TYP_SIMD64; + case TYP_SIMD32: + if (m_compiler->getPreferredVectorByteLength() >= 64) + { + newType = TYP_SIMD64; + break; + } + tryReusingPrevValue = true; break; - } - tryReusingPrevValue = true; - break; #elif defined(TARGET_ARM64) // TARGET_AMD64 - case TYP_SIMD16: - tryReusingPrevValue = true; - break; - -#endif // TARGET_ARM64 -#endif // FEATURE_HW_INTRINSICS -#endif // TARGET_64BIT + case TYP_SIMD16: + tryReusingPrevValue = true; + break; +#endif // TARGET_AMD64 +#endif // FEATURE_HW_INTRINSICS +#endif // TARGET_64BIT + default: + return; + } + } - // TYP_FLOAT and TYP_DOUBLE aren't needed here - they're expected to - // be converted to TYP_INT/TYP_LONG for constant value. - // - // TYP_UINT and TYP_ULONG are not legal for GT_IND. - // - default: - return; + if (!LowerCheckCoalescedStoreAtomicity(node, currData, prevData)) + { + return; } - // If we can't merge these two stores into a single store, we can at least - // cache prevData.value to a local and reuse it in currData. - // Normally, LSRA is expected to do this for us, but it's not always the case for SIMD. if (tryReusingPrevValue) { #if defined(FEATURE_HW_INTRINSICS) @@ -10802,7 +11008,15 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) m_compiler->gtNewLclvNode(use.ReplaceWithLclVar(m_compiler), prevData.value->TypeGet()); BlockRange().InsertBefore(currData.value, prevValueTmp); BlockRange().Remove(currData.value); - ind->Data() = prevValueTmp; + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = prevValueTmp; + } + else + { + node->AsIndir()->Data() = prevValueTmp; + } } #endif // FEATURE_HW_INTRINSICS return; @@ -10810,35 +11024,50 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) assert(newType != TYP_UNDEF); - // We should not be here for stores requiring write barriers. - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind->AsStoreInd())); - assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd->AsStoreInd())); + if (node->OperIs(GT_STOREIND, GT_STORE_BLK)) + { + auto* ind = node->AsStoreInd(); + auto* prevInd = prevTree->AsStoreInd(); - // Delete previous STOREIND entirely - BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(ind)); + assert(!m_compiler->codeGen->gcInfo.gcIsWriteBarrierStoreIndNode(prevInd)); - // It's not expected to be contained yet, but just in case... 
- ind->Data()->ClearContained(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // We know it's always LEA for now - GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + ind->Data()->ClearContained(); + GenTreeAddrMode* addr = ind->Addr()->AsAddrMode(); + addr->SetOffset(min(prevData.offset, currData.offset)); - // Update offset to be the minimum of the two - addr->SetOffset(min(prevData.offset, currData.offset)); + ind->gtType = newType; + ind->Data()->gtType = newType; + } + else + { + auto* lclStore = node->AsLclVarCommon(); + BlockRange().Remove(prevData.rangeStart, prevData.rangeEnd); - // Update type for both STOREIND and val - ind->gtType = newType; - ind->Data()->gtType = newType; + lclStore->Data()->ClearContained(); + lclStore->gtType = newType; + lclStore->Data()->gtType = newType; + if (lclStore->OperIs(GT_STORE_LCL_FLD)) + { + lclStore->AsLclFld()->SetLclOffs(min(prevData.offset, currData.offset)); + } + } #if defined(TARGET_AMD64) && defined(FEATURE_HW_INTRINSICS) // Upgrading two SIMD stores to a wider SIMD store. - // Only on x64 since ARM64 has no options above SIMD16 + // Only on x64 since ARM64 has no options above SIMD16. if (varTypeIsSIMD(oldType)) { + if (!prevData.value->OperIs(GT_CNS_VEC) || !currData.value->OperIs(GT_CNS_VEC)) + { + return; + } + int8_t* lowerCns = prevData.value->AsVecCon()->gtSimdVal.i8; int8_t* upperCns = currData.value->AsVecCon()->gtSimdVal.i8; - // if the previous store was at a higher address, swap the constants if (prevData.offset > currData.offset) { std::swap(lowerCns, upperCns); @@ -10849,58 +11078,109 @@ void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) memcpy(newCns.i8, lowerCns, oldWidth); memcpy(newCns.i8 + oldWidth, upperCns, oldWidth); - ind->Data()->AsVecCon()->gtSimdVal = newCns; + currData.value->AsVecCon()->gtSimdVal = newCns; continue; } -#endif +#endif // TARGET_AMD64 && FEATURE_HW_INTRINSICS - size_t lowerCns = (size_t)prevData.value->AsIntCon()->IconValue(); - size_t upperCns = (size_t)currData.value->AsIntCon()->IconValue(); - - // if the previous store was at a higher address, swap the constants - if (prevData.offset > currData.offset) + // The integer path below places each constant according to its byte offset, so it doesn't need to swap the + // values first. Only the SIMD packing paths above need to normalize lower/upper order explicitly. + uint64_t lowerCns = 0; + uint64_t upperCns = 0; + if (!TryGetStoreCoalescingConstantBits(prevData.value, &lowerCns) || + !TryGetStoreCoalescingConstantBits(currData.value, &upperCns)) { - std::swap(lowerCns, upperCns); + return; } #if defined(TARGET_64BIT) && defined(FEATURE_HW_INTRINSICS) - // We're promoting two TYP_LONG/TYP_REF into TYP_SIMD16 - // All legality checks were done above. if (varTypeIsSIMD(newType)) { - // Replace two 64bit constants with a single 128bit constant + if (prevData.offset > currData.offset) + { + std::swap(lowerCns, upperCns); + } + int8_t val[16]; memcpy(val, &lowerCns, 8); memcpy(val + 8, &upperCns, 8); GenTreeVecCon* vecCns = m_compiler->gtNewVconNode(newType, &val); - BlockRange().InsertAfter(ind->Data(), vecCns); - BlockRange().Remove(ind->Data()); - ind->gtOp2 = vecCns; + BlockRange().InsertAfter(currData.value, vecCns); + BlockRange().Remove(currData.value); + + if (node->OperIsLocalStore()) + { + node->AsLclVarCommon()->Data() = vecCns; + } + else + { + node->AsIndir()->Data() = vecCns; + } + continue; } #endif // TARGET_64BIT && FEATURE_HW_INTRINSICS - // Trim the constants to the size of the type, e.g. 
for TYP_SHORT and TYP_USHORT - // the mask will be 0xFFFF, for TYP_INT - 0xFFFFFFFF. - size_t mask = ~(size_t(0)) >> (sizeof(size_t) - genTypeSize(oldType)) * BITS_PER_BYTE; - lowerCns &= mask; - upperCns &= mask; + uint64_t prevMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - prevData.accessSize) * BITS_PER_BYTE; + uint64_t currMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - currData.accessSize) * BITS_PER_BYTE; + uint64_t newMask = ~(uint64_t(0)) >> (sizeof(uint64_t) - genTypeSize(newType)) * BITS_PER_BYTE; + lowerCns &= prevMask; + upperCns &= currMask; - size_t val = (lowerCns | (upperCns << (genTypeSize(oldType) * BITS_PER_BYTE))); + unsigned const prevShift = static_cast<unsigned>((prevData.offset - minOffset) * BITS_PER_BYTE); + unsigned const currShift = static_cast<unsigned>((currData.offset - minOffset) * BITS_PER_BYTE); + + uint64_t prevBits = (lowerCns << prevShift) & newMask; + uint64_t currBits = (upperCns << currShift) & newMask; + + // Later stores must overwrite any overlapping bytes from earlier stores. + uint64_t currBitsMask = (currMask << currShift) & newMask; + uint64_t val = (prevBits & ~currBitsMask) | currBits; JITDUMP("Coalesced two stores into a single store with value %lld\n", (int64_t)val); - ind->Data()->AsIntCon()->gtIconVal = (ssize_t)val; - if (genTypeSize(oldType) == 1) + assert(currData.value->OperIs(GT_CNS_INT)); + auto* intCon = currData.value->AsIntCon(); + intCon->gtIconVal = static_cast<ssize_t>(val); + if (genTypeSize(oldType) == 1 && node->OperIsIndir()) { - // A mark for future foldings that this IND doesn't need to be atomic. - ind->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; + node->gtFlags |= GTF_IND_ALLOW_NON_ATOMIC; } - } while (true); #endif // TARGET_XARCH || TARGET_ARM64 } +//------------------------------------------------------------------------ +// LowerStoreIndirCoalescing: If the given IND/STOREIND node is followed by a similar +// IND/STOREIND node, try to merge them into a single store of a twice wider type. Example: +// +// * STOREIND int +// +--* LCL_VAR byref V00 +// \--* CNS_INT int 0x1 +// +// * STOREIND int +// +--* LEA(b+4) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT int 0x2 +// +// We can merge these two into a single store of 8 bytes with (0x1 | (0x2 << 32)) as the value +// +// * STOREIND long +// +--* LEA(b+0) byref +// | \--* LCL_VAR byref V00 +// \--* CNS_INT long 0x200000001 +// +// NOTE: Our memory model allows us to do this optimization, see Memory-model.md: +// * Adjacent non-volatile writes to the same location can be coalesced. (see Memory-model.md) +// +// Arguments: +// ind - the current STOREIND node +// +void Lowering::LowerStoreIndirCoalescing(GenTreeIndir* ind) +{ + LowerStoreCoalescing(ind); +} + //------------------------------------------------------------------------ // LowerStoreIndirCommon: a common logic to lower StoreIndir.
// diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index d88ab35f9c2ad0..a3e7b8f8356210 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -351,32 +351,62 @@ class Lowering final : public Phase struct LoadStoreCoalescingData { - var_types targetType; - GenTree* baseAddr; - GenTree* index; - GenTree* value; - uint32_t scale; - int offset; - GenTree* rangeStart; - GenTree* rangeEnd; + var_types targetType = TYP_UNDEF; + GenTree* baseAddr = nullptr; + GenTree* index = nullptr; + GenTree* value = nullptr; + uint32_t scale = 1; + int offset = 0; + unsigned accessSize = 0; + unsigned lclNum = BAD_VAR_NUM; + GenTreeFlags storeFlags = GTF_EMPTY; + bool isLocalStoreNode = false; + bool isAddressExposed = false; + GenTree* rangeStart = nullptr; + GenTree* rangeEnd = nullptr; bool IsStore() const { return value != nullptr; } - bool IsAddressEqual(const LoadStoreCoalescingData& other, bool ignoreOffset) const + bool IsLocalStore() const { - if ((scale != other.scale) || (targetType != other.targetType) || - !GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + return isLocalStoreNode; + } + + bool AllowsNonAtomic() const + { + return (storeFlags & GTF_IND_ALLOW_NON_ATOMIC) != 0; + } + + bool IsAddressEqual(const LoadStoreCoalescingData& other, + bool ignoreOffset, + bool allowDifferentTypes = false) const + { + if ((scale != other.scale) || (!allowDifferentTypes && (targetType != other.targetType))) { return false; } + + if (IsLocalStore() || other.IsLocalStore()) + { + if (!IsLocalStore() || !other.IsLocalStore() || (lclNum != other.lclNum)) + { + return false; + } + } + else if (!GenTree::Compare(baseAddr, other.baseAddr) || !GenTree::Compare(index, other.index)) + { + return false; + } + return ignoreOffset || (offset == other.offset); } }; bool GetLoadStoreCoalescingData(GenTreeIndir* ind, LoadStoreCoalescingData* data) const; + bool GetStoreCoalescingData(GenTree* store, LoadStoreCoalescingData* data) const; // Per tree node member functions GenTree* LowerStoreIndirCommon(GenTreeStoreInd* ind); @@ -389,7 +419,11 @@ class Lowering final : public Phase void MarkTree(GenTree* root); void UnmarkTree(GenTree* root); GenTree* LowerStoreIndir(GenTreeStoreInd* node); + void LowerStoreCoalescing(GenTree* node); void LowerStoreIndirCoalescing(GenTreeIndir* node); + bool LowerCheckCoalescedStoreAtomicity(GenTree* currStore, + const LoadStoreCoalescingData& currData, + const LoadStoreCoalescingData& prevData); GenTree* LowerAdd(GenTreeOp* node); GenTree* LowerMul(GenTreeOp* mul); bool TryLowerAndNegativeOne(GenTreeOp* node, GenTree** nextNode); diff --git a/src/tests/JIT/Directed/StructPromote/SpAddr.cs b/src/tests/JIT/Directed/StructPromote/SpAddr.cs index b9fa68710da733..e44c3176c89c0e 100644 --- a/src/tests/JIT/Directed/StructPromote/SpAddr.cs +++ b/src/tests/JIT/Directed/StructPromote/SpAddr.cs @@ -2,20 +2,30 @@ // The .NET Foundation licenses this file to you under the MIT license. 
// -using System.Runtime.CompilerServices; -using System; +using System; +using System.Runtime.CompilerServices; using Xunit; public class SpAddr { - - // Struct in reg (2 ints) struct S { public int i0; public int i1; } + struct Pair + { + public int A; + public int B; + } + + struct FloatPair + { + public float A; + public float B; + } + [MethodImpl(MethodImplOptions.NoInlining)] static int Foo(S s0, S s1) { @@ -35,12 +45,74 @@ static int M(int i0, int i1, int i2, int i3) return Foo(s0, s1); // r0 <= r1; r1 <= r0; r2 <= r3; r3 <= r2 } + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(Pair p) => ((long)p.B << 32) | (uint)p.A; + + [MethodImpl(MethodImplOptions.NoInlining)] + static long Consume(FloatPair p) => + ((long)(uint)BitConverter.SingleToInt32Bits(p.B) << 32) | (uint)BitConverter.SingleToInt32Bits(p.A); + + [MethodImpl(MethodImplOptions.NoInlining)] + static void Expose(ref Pair p) + { + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long NonAddressExposed(int a, int b) + { + Pair p; + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AddressExposed(int a, int b) + { + Pair p = default; + Expose(ref p); + p.A = a; + p.B = b; + return Consume(p); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static int OverlappingNonZeroThenShort() + { + Pair p = default; + p.A = unchecked((int)0xDEADBEEF); + Unsafe.As<Pair, short>(ref p) = 0; + return p.A; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + static long AdjacentFloatNonZeroThenZero() + { + FloatPair p = default; + p.A = 1.0f; + p.B = 0.0f; + return Consume(p); + } + [Fact] public static int TestEntryPoint() { int res = M(1, 2, 3, 4); Console.WriteLine("M(1, 2, 3, 4) is {0}.", res); - if (res == 10) + long nonAddressExposed = NonAddressExposed(0x11223344, 0x55667788); + long addressExposed = AddressExposed(0x10203040, 0x50607080); + int overlapping = OverlappingNonZeroThenShort(); + long floatPair = AdjacentFloatNonZeroThenZero(); + + int expectedOverlapping = BitConverter.IsLittleEndian + ? unchecked((int)0xDEAD0000) + : unchecked((int)0x0000BEEF); + + if ((res == 10) && + (nonAddressExposed == 0x5566778811223344L) && + (addressExposed == 0x5060708010203040L) && + (overlapping == expectedOverlapping) && + (floatPair == 0x000000003F800000L)) return 100; else return 99;
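For context, a minimal C# sketch (hypothetical names, not part of this test file) of the pattern the new STORE_LCL_FLD coalescing targets: adjacent constant stores to fields of a stack struct local, which lowering may now merge into a single wider store.

    using System.Runtime.CompilerServices;

    public static class StoreCoalescingSketch
    {
        struct Pair { public int A; public int B; }

        [MethodImpl(MethodImplOptions.NoInlining)]
        public static long Make()
        {
            Pair p;
            p.A = 0x1; // 4-byte constant store at offset 0
            p.B = 0x2; // adjacent 4-byte constant store at offset 4
            // With JitEnableStoreLclFldCoalescing=1, lowering may emit one 8-byte store of 0x200000001.
            return ((long)p.B << 32) | (uint)p.A;
        }
    }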