From d444a91e242f131f55f17d64ca86870928ea05e2 Mon Sep 17 00:00:00 2001 From: aromaa Date: Tue, 22 Mar 2022 01:19:14 +0200 Subject: [PATCH 01/10] Optimize bswap+mov to movbe --- src/coreclr/inc/clrconfigvalues.h | 1 + src/coreclr/inc/corinfoinstructionset.h | 90 ++++++++++------- src/coreclr/inc/jiteeversionguid.h | 10 +- src/coreclr/inc/readytoruninstructionset.h | 1 + src/coreclr/jit/codegenlinear.cpp | 2 +- src/coreclr/jit/codegenxarch.cpp | 41 ++++---- src/coreclr/jit/emitxarch.cpp | 19 ++++ src/coreclr/jit/instrsxarch.h | 1 + src/coreclr/jit/lower.cpp | 6 ++ src/coreclr/jit/lower.h | 1 + src/coreclr/jit/lowerxarch.cpp | 30 ++++++ src/coreclr/jit/lsrabuild.cpp | 5 + .../Runtime/ReadyToRunInstructionSet.cs | 1 + .../Runtime/ReadyToRunInstructionSetHelper.cs | 4 + .../JitInterface/CorInfoInstructionSet.cs | 96 ++++++++++++------- .../ThunkGenerator/InstructionSetDesc.txt | 3 + src/coreclr/vm/codeman.cpp | 13 +++ .../BinaryPrimitivesReverseEndianness.cs | 75 +++++++++++++++ 18 files changed, 308 insertions(+), 91 deletions(-) diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 86beb15b0a6376..ee15bdb3a2b0c0 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -757,6 +757,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableBMI2, W("EnableBMI2"), 1 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableFMA, W("EnableFMA"), 1, "Allows FMA+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableLZCNT, W("EnableLZCNT"), 1, "Allows LZCNT+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePCLMULQDQ, W("EnablePCLMULQDQ"), 1, "Allows PCLMULQDQ+ hardware intrinsics to be disabled") +RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableMOVBE, W("EnableMOVBE"), 1, "Allows MOVBE+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnablePOPCNT, W("EnablePOPCNT"), 1, "Allows POPCNT+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE, W("EnableSSE"), 1, "Allows SSE+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableSSE2, W("EnableSSE2"), 1, "Allows SSE2+ hardware intrinsics to be disabled") diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index b8ccc0af8446a5..778b1839e4afbd 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -58,23 +58,25 @@ enum CORINFO_InstructionSet InstructionSet_Vector128=17, InstructionSet_Vector256=18, InstructionSet_AVXVNNI=19, - InstructionSet_X86Base_X64=20, - InstructionSet_SSE_X64=21, - InstructionSet_SSE2_X64=22, - InstructionSet_SSE3_X64=23, - InstructionSet_SSSE3_X64=24, - InstructionSet_SSE41_X64=25, - InstructionSet_SSE42_X64=26, - InstructionSet_AVX_X64=27, - InstructionSet_AVX2_X64=28, - InstructionSet_AES_X64=29, - InstructionSet_BMI1_X64=30, - InstructionSet_BMI2_X64=31, - InstructionSet_FMA_X64=32, - InstructionSet_LZCNT_X64=33, - InstructionSet_PCLMULQDQ_X64=34, - InstructionSet_POPCNT_X64=35, - InstructionSet_AVXVNNI_X64=36, + InstructionSet_MOVBE=20, + InstructionSet_X86Base_X64=21, + InstructionSet_SSE_X64=22, + InstructionSet_SSE2_X64=23, + InstructionSet_SSE3_X64=24, + InstructionSet_SSSE3_X64=25, + InstructionSet_SSE41_X64=26, + InstructionSet_SSE42_X64=27, + InstructionSet_AVX_X64=28, + InstructionSet_AVX2_X64=29, + InstructionSet_AES_X64=30, + InstructionSet_BMI1_X64=31, + InstructionSet_BMI2_X64=32, + InstructionSet_FMA_X64=33, + InstructionSet_LZCNT_X64=34, + InstructionSet_PCLMULQDQ_X64=35, + InstructionSet_POPCNT_X64=36, + InstructionSet_AVXVNNI_X64=37, + InstructionSet_MOVBE_X64=38, #endif // TARGET_AMD64 #ifdef TARGET_X86 InstructionSet_X86Base=1, @@ -96,23 +98,25 @@ enum CORINFO_InstructionSet InstructionSet_Vector128=17, InstructionSet_Vector256=18, InstructionSet_AVXVNNI=19, - InstructionSet_X86Base_X64=20, - InstructionSet_SSE_X64=21, - InstructionSet_SSE2_X64=22, - InstructionSet_SSE3_X64=23, - InstructionSet_SSSE3_X64=24, - InstructionSet_SSE41_X64=25, - InstructionSet_SSE42_X64=26, - InstructionSet_AVX_X64=27, - InstructionSet_AVX2_X64=28, - InstructionSet_AES_X64=29, - InstructionSet_BMI1_X64=30, - InstructionSet_BMI2_X64=31, - InstructionSet_FMA_X64=32, - InstructionSet_LZCNT_X64=33, - InstructionSet_PCLMULQDQ_X64=34, - InstructionSet_POPCNT_X64=35, - InstructionSet_AVXVNNI_X64=36, + InstructionSet_MOVBE=20, + InstructionSet_X86Base_X64=21, + InstructionSet_SSE_X64=22, + InstructionSet_SSE2_X64=23, + InstructionSet_SSE3_X64=24, + InstructionSet_SSSE3_X64=25, + InstructionSet_SSE41_X64=26, + InstructionSet_SSE42_X64=27, + InstructionSet_AVX_X64=28, + InstructionSet_AVX2_X64=29, + InstructionSet_AES_X64=30, + InstructionSet_BMI1_X64=31, + InstructionSet_BMI2_X64=32, + InstructionSet_FMA_X64=33, + InstructionSet_LZCNT_X64=34, + InstructionSet_PCLMULQDQ_X64=35, + InstructionSet_POPCNT_X64=36, + InstructionSet_AVXVNNI_X64=37, + InstructionSet_MOVBE_X64=38, #endif // TARGET_X86 }; @@ -212,6 +216,8 @@ struct CORINFO_InstructionSetFlags AddInstructionSet(InstructionSet_POPCNT_X64); if (HasInstructionSet(InstructionSet_AVXVNNI)) AddInstructionSet(InstructionSet_AVXVNNI_X64); + if (HasInstructionSet(InstructionSet_MOVBE)) + AddInstructionSet(InstructionSet_MOVBE_X64); #endif // TARGET_AMD64 #ifdef TARGET_X86 #endif // TARGET_X86 @@ -357,6 +363,10 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI); if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI_X64) && !resultflags.HasInstructionSet(InstructionSet_AVXVNNI)) resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI_X64); + if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_MOVBE_X64)) + resultflags.RemoveInstructionSet(InstructionSet_MOVBE); + if (resultflags.HasInstructionSet(InstructionSet_MOVBE_X64) && !resultflags.HasInstructionSet(InstructionSet_MOVBE)) + resultflags.RemoveInstructionSet(InstructionSet_MOVBE_X64); if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) resultflags.RemoveInstructionSet(InstructionSet_SSE); if (resultflags.HasInstructionSet(InstructionSet_SSE2) && !resultflags.HasInstructionSet(InstructionSet_SSE)) @@ -393,6 +403,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_Vector256); if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2)) resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + resultflags.RemoveInstructionSet(InstructionSet_MOVBE); #endif // TARGET_AMD64 #ifdef TARGET_X86 if (resultflags.HasInstructionSet(InstructionSet_SSE) && !resultflags.HasInstructionSet(InstructionSet_X86Base)) @@ -431,6 +443,8 @@ inline CORINFO_InstructionSetFlags EnsureInstructionSetFlagsAreValid(CORINFO_Ins resultflags.RemoveInstructionSet(InstructionSet_Vector256); if (resultflags.HasInstructionSet(InstructionSet_AVXVNNI) && !resultflags.HasInstructionSet(InstructionSet_AVX2)) resultflags.RemoveInstructionSet(InstructionSet_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet_MOVBE) && !resultflags.HasInstructionSet(InstructionSet_SSE42)) + resultflags.RemoveInstructionSet(InstructionSet_MOVBE); #endif // TARGET_X86 } while (!oldflags.Equals(resultflags)); @@ -563,6 +577,10 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) return "AVXVNNI"; case InstructionSet_AVXVNNI_X64 : return "AVXVNNI_X64"; + case InstructionSet_MOVBE : + return "MOVBE"; + case InstructionSet_MOVBE_X64 : + return "MOVBE_X64"; #endif // TARGET_AMD64 #ifdef TARGET_X86 case InstructionSet_X86Base : @@ -603,6 +621,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) return "Vector256"; case InstructionSet_AVXVNNI : return "AVXVNNI"; + case InstructionSet_MOVBE : + return "MOVBE"; #endif // TARGET_X86 default: @@ -652,6 +672,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ; case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT; case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI; + case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE; #endif // TARGET_AMD64 #ifdef TARGET_X86 case READYTORUN_INSTRUCTION_X86Base: return InstructionSet_X86Base; @@ -671,6 +692,7 @@ inline CORINFO_InstructionSet InstructionSetFromR2RInstructionSet(ReadyToRunInst case READYTORUN_INSTRUCTION_Pclmulqdq: return InstructionSet_PCLMULQDQ; case READYTORUN_INSTRUCTION_Popcnt: return InstructionSet_POPCNT; case READYTORUN_INSTRUCTION_AvxVnni: return InstructionSet_AVXVNNI; + case READYTORUN_INSTRUCTION_Movbe: return InstructionSet_MOVBE; #endif // TARGET_X86 default: diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 59a16ece8cca0d..ce062f5e082b1a 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID; #define GUID_DEFINED #endif // !GUID_DEFINED -constexpr GUID JITEEVersionIdentifier = { /* b0719856-6fe6-407c-bf40-7a57e22b2382 */ - 0xb0719856, - 0x6fe6, - 0x407c, - {0xbf, 0x40, 0x7a, 0x57, 0xe2, 0x2b, 0x23, 0x82} +constexpr GUID JITEEVersionIdentifier = { /* dfda4767-9618-4d6a-8105-a86034dc52eb */ + 0xdfda4767, + 0x9618, + 0x4d6a, + {0x81, 0x05, 0xa8, 0x60, 0x34, 0xdc, 0x52, 0xeb} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/inc/readytoruninstructionset.h b/src/coreclr/inc/readytoruninstructionset.h index 6d7d65a651b6bb..62935f58668d6b 100644 --- a/src/coreclr/inc/readytoruninstructionset.h +++ b/src/coreclr/inc/readytoruninstructionset.h @@ -35,6 +35,7 @@ enum ReadyToRunInstructionSet READYTORUN_INSTRUCTION_Rdm=24, READYTORUN_INSTRUCTION_AvxVnni=25, READYTORUN_INSTRUCTION_Rcpc=26, + READYTORUN_INSTRUCTION_Movbe=27, }; diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 83efe5685c3ba1..152b8610394d55 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -1653,7 +1653,7 @@ void CodeGen::genConsumeRegs(GenTree* tree) } #endif // FEATURE_HW_INTRINSICS #endif // TARGET_XARCH - else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH)) + else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH, GT_BSWAP)) { genConsumeRegs(tree->gtGetOp1()); } diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 326a2be1289f73..d86669732bfac7 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -553,34 +553,38 @@ void CodeGen::genCodeForNegNot(GenTree* tree) // void CodeGen::genCodeForBswap(GenTree* tree) { - // TODO: If we're swapping immediately after a read from memory or immediately before - // a write to memory, use the MOVBE instruction instead of the BSWAP instruction if - // the platform supports it. - assert(tree->OperIs(GT_BSWAP, GT_BSWAP16)); regNumber targetReg = tree->GetRegNum(); var_types targetType = tree->TypeGet(); - GenTree* operand = tree->gtGetOp1(); - assert(operand->isUsedFromReg()); - regNumber operandReg = genConsumeReg(operand); - - inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); + GenTree* operand = tree->gtGetOp1(); - if (tree->OperIs(GT_BSWAP)) + if (operand->isUsedFromReg()) { - // 32-bit and 64-bit byte swaps use "bswap reg" - inst_RV(INS_bswap, targetReg, targetType); + regNumber operandReg = genConsumeReg(operand); + + inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); } else { - // 16-bit byte swaps use "ror reg.16, 8" - inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); + assert(operand->OperIs(GT_IND)); + + genConsumeReg(operand->gtGetOp1()); + GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand); + } - if (!genCanOmitNormalizationForBswap16(tree)) + if (operand->isUsedFromReg()) + { + if (tree->OperIs(GT_BSWAP)) + { + // 32-bit and 64-bit byte swaps use "bswap reg" + inst_RV(INS_bswap, targetReg, targetType); + } + else { - GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); + // 16-bit byte swaps use "ror reg.16, 8" + inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); } } @@ -5142,7 +5146,10 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) } else { - GetEmitter()->emitInsStoreInd(ins_Store(data->TypeGet()), emitTypeSize(tree), tree); + GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP) && data->isContained() + ? INS_movbe + : ins_Store(data->TypeGet()), + emitTypeSize(tree), tree); } } } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 855f85f1406f92..a6da809bedac7f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3387,6 +3387,11 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m } else { + if (data->OperIs(GT_BSWAP) && data->isContained()) + { + data = data->gtGetOp1(); + } + assert(!data->isContained()); id = emitNewInstrAmd(attr, offset); id->idIns(ins); @@ -16282,6 +16287,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; } + case INS_movbe: + if (memAccessKind == PERFSCORE_MEMORY_READ) + { + result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insLatency += PERFSCORE_THROUGHPUT_2C; + } + else + { + assert(memAccessKind == PERFSCORE_MEMORY_WRITE); + result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insLatency += PERFSCORE_THROUGHPUT_2C; + } + break; + default: // unhandled instruction insFmt combination perfScoreUnhandledInstruction(id, &result); diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index bb76e56b6a0542..a8916e84404f3a 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -65,6 +65,7 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None ) +INST4(movbe, "movbe", IUM_WR, 0x380F00F1, BAD_CODE, 0x380F00F0, BAD_CODE, INS_FLAGS_Has_Wbit ) // id nm um mr mi rm a4 flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 3682fb9e53cf8b..5959f7340c33ff 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -418,6 +418,12 @@ GenTree* Lowering::LowerNode(GenTree* node) } break; +#if defined(TARGET_XARCH) + case GT_BSWAP: + LowerBswapOp(node->AsOp()); + break; +#endif // TARGET_XARCH + default: break; } diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index 0512d4173ac13e..7ad3f6202d6697 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -350,6 +350,7 @@ class Lowering final : public Phase GenTree* TryLowerAndOpToResetLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToExtractLowestSetBit(GenTreeOp* andNode); GenTree* TryLowerAndOpToAndNot(GenTreeOp* andNode); + void LowerBswapOp(GenTreeOp* node); #elif defined(TARGET_ARM64) bool IsValidConstForMovImm(GenTreeHWIntrinsic* node); void LowerHWIntrinsicFusedMultiplyAddScalar(GenTreeHWIntrinsic* node); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 853121bf6a2b59..222186ab1b751d 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3945,6 +3945,36 @@ GenTree* Lowering::TryLowerAndOpToAndNot(GenTreeOp* andNode) return andnNode; } +void Lowering::LowerBswapOp(GenTreeOp* node) +{ + assert(node->OperIs(GT_BSWAP)); + + if (!comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE)) + { + return; + } + + GenTree* operand = node->gtGetOp1(); + if (operand->OperIs(GT_IND)) + { + operand->SetContained(); + + return; + } + + LIR::Use use; + if (!BlockRange().TryGetUse(node, &use)) + { + return; + } + + GenTree* user = use.User(); + if (user->OperIs(GT_STOREIND)) + { + node->SetContained(); + } +} + #endif // FEATURE_HW_INTRINSICS //---------------------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 65d06d30ffd33e..40995f1baeacb4 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3137,6 +3137,11 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates) { return BuildAddrUses(node, candidates); } + if (node->OperIs(GT_BSWAP)) + { + BuildOperandUses(node->gtGetOp1(), candidates); + return 1; + } #ifdef FEATURE_HW_INTRINSICS if (node->OperIsHWIntrinsic()) { diff --git a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSet.cs b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSet.cs index ae045b02cfd13b..1157bf84edf4e6 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSet.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSet.cs @@ -38,6 +38,7 @@ public enum ReadyToRunInstructionSet Rdm=24, AvxVnni=25, Rcpc=26, + Movbe=27, } } diff --git a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs index 730e6c4f41680d..69c29c7f960707 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs @@ -89,6 +89,8 @@ public static class ReadyToRunInstructionSetHelper case InstructionSet.X64_Vector256: return null; case InstructionSet.X64_AVXVNNI: return ReadyToRunInstructionSet.AvxVnni; case InstructionSet.X64_AVXVNNI_X64: return ReadyToRunInstructionSet.AvxVnni; + case InstructionSet.X64_MOVBE: return ReadyToRunInstructionSet.Movbe; + case InstructionSet.X64_MOVBE_X64: return ReadyToRunInstructionSet.Movbe; default: throw new Exception("Unknown instruction set"); } @@ -134,6 +136,8 @@ public static class ReadyToRunInstructionSetHelper case InstructionSet.X86_Vector256: return null; case InstructionSet.X86_AVXVNNI: return ReadyToRunInstructionSet.AvxVnni; case InstructionSet.X86_AVXVNNI_X64: return null; + case InstructionSet.X86_MOVBE: return ReadyToRunInstructionSet.Movbe; + case InstructionSet.X86_MOVBE_X64: return null; default: throw new Exception("Unknown instruction set"); } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs index f90947e8697661..fc90274c415bf3 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs @@ -58,6 +58,7 @@ public enum InstructionSet X64_Vector128 = InstructionSet_X64.Vector128, X64_Vector256 = InstructionSet_X64.Vector256, X64_AVXVNNI = InstructionSet_X64.AVXVNNI, + X64_MOVBE = InstructionSet_X64.MOVBE, X64_X86Base_X64 = InstructionSet_X64.X86Base_X64, X64_SSE_X64 = InstructionSet_X64.SSE_X64, X64_SSE2_X64 = InstructionSet_X64.SSE2_X64, @@ -75,6 +76,7 @@ public enum InstructionSet X64_PCLMULQDQ_X64 = InstructionSet_X64.PCLMULQDQ_X64, X64_POPCNT_X64 = InstructionSet_X64.POPCNT_X64, X64_AVXVNNI_X64 = InstructionSet_X64.AVXVNNI_X64, + X64_MOVBE_X64 = InstructionSet_X64.MOVBE_X64, X86_X86Base = InstructionSet_X86.X86Base, X86_SSE = InstructionSet_X86.SSE, X86_SSE2 = InstructionSet_X86.SSE2, @@ -94,6 +96,7 @@ public enum InstructionSet X86_Vector128 = InstructionSet_X86.Vector128, X86_Vector256 = InstructionSet_X86.Vector256, X86_AVXVNNI = InstructionSet_X86.AVXVNNI, + X86_MOVBE = InstructionSet_X86.MOVBE, X86_X86Base_X64 = InstructionSet_X86.X86Base_X64, X86_SSE_X64 = InstructionSet_X86.SSE_X64, X86_SSE2_X64 = InstructionSet_X86.SSE2_X64, @@ -111,6 +114,7 @@ public enum InstructionSet X86_PCLMULQDQ_X64 = InstructionSet_X86.PCLMULQDQ_X64, X86_POPCNT_X64 = InstructionSet_X86.POPCNT_X64, X86_AVXVNNI_X64 = InstructionSet_X86.AVXVNNI_X64, + X86_MOVBE_X64 = InstructionSet_X86.MOVBE_X64, } public enum InstructionSet_ARM64 @@ -163,23 +167,25 @@ public enum InstructionSet_X64 Vector128 = 17, Vector256 = 18, AVXVNNI = 19, - X86Base_X64 = 20, - SSE_X64 = 21, - SSE2_X64 = 22, - SSE3_X64 = 23, - SSSE3_X64 = 24, - SSE41_X64 = 25, - SSE42_X64 = 26, - AVX_X64 = 27, - AVX2_X64 = 28, - AES_X64 = 29, - BMI1_X64 = 30, - BMI2_X64 = 31, - FMA_X64 = 32, - LZCNT_X64 = 33, - PCLMULQDQ_X64 = 34, - POPCNT_X64 = 35, - AVXVNNI_X64 = 36, + MOVBE = 20, + X86Base_X64 = 21, + SSE_X64 = 22, + SSE2_X64 = 23, + SSE3_X64 = 24, + SSSE3_X64 = 25, + SSE41_X64 = 26, + SSE42_X64 = 27, + AVX_X64 = 28, + AVX2_X64 = 29, + AES_X64 = 30, + BMI1_X64 = 31, + BMI2_X64 = 32, + FMA_X64 = 33, + LZCNT_X64 = 34, + PCLMULQDQ_X64 = 35, + POPCNT_X64 = 36, + AVXVNNI_X64 = 37, + MOVBE_X64 = 38, } public enum InstructionSet_X86 @@ -205,23 +211,25 @@ public enum InstructionSet_X86 Vector128 = 17, Vector256 = 18, AVXVNNI = 19, - X86Base_X64 = 20, - SSE_X64 = 21, - SSE2_X64 = 22, - SSE3_X64 = 23, - SSSE3_X64 = 24, - SSE41_X64 = 25, - SSE42_X64 = 26, - AVX_X64 = 27, - AVX2_X64 = 28, - AES_X64 = 29, - BMI1_X64 = 30, - BMI2_X64 = 31, - FMA_X64 = 32, - LZCNT_X64 = 33, - PCLMULQDQ_X64 = 34, - POPCNT_X64 = 35, - AVXVNNI_X64 = 36, + MOVBE = 20, + X86Base_X64 = 21, + SSE_X64 = 22, + SSE2_X64 = 23, + SSE3_X64 = 24, + SSSE3_X64 = 25, + SSE41_X64 = 26, + SSE42_X64 = 27, + AVX_X64 = 28, + AVX2_X64 = 29, + AES_X64 = 30, + BMI1_X64 = 31, + BMI2_X64 = 32, + FMA_X64 = 33, + LZCNT_X64 = 34, + PCLMULQDQ_X64 = 35, + POPCNT_X64 = 36, + AVXVNNI_X64 = 37, + MOVBE_X64 = 38, } public struct InstructionSetFlags : IEnumerable @@ -457,6 +465,10 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI_X64); if (resultflags.HasInstructionSet(InstructionSet.X64_AVXVNNI_X64)) resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet.X64_MOVBE)) + resultflags.AddInstructionSet(InstructionSet.X64_MOVBE_X64); + if (resultflags.HasInstructionSet(InstructionSet.X64_MOVBE_X64)) + resultflags.AddInstructionSet(InstructionSet.X64_MOVBE); if (resultflags.HasInstructionSet(InstructionSet.X64_SSE)) resultflags.AddInstructionSet(InstructionSet.X64_X86Base); if (resultflags.HasInstructionSet(InstructionSet.X64_SSE2)) @@ -493,6 +505,8 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.X64_AVX); if (resultflags.HasInstructionSet(InstructionSet.X64_AVXVNNI)) resultflags.AddInstructionSet(InstructionSet.X64_AVX2); + if (resultflags.HasInstructionSet(InstructionSet.X64_MOVBE)) + resultflags.AddInstructionSet(InstructionSet.X64_SSE42); break; case TargetArchitecture.X86: @@ -532,6 +546,8 @@ public static InstructionSetFlags ExpandInstructionSetByImplicationHelper(Target resultflags.AddInstructionSet(InstructionSet.X86_AVX); if (resultflags.HasInstructionSet(InstructionSet.X86_AVXVNNI)) resultflags.AddInstructionSet(InstructionSet.X86_AVX2); + if (resultflags.HasInstructionSet(InstructionSet.X86_MOVBE)) + resultflags.AddInstructionSet(InstructionSet.X86_SSE42); break; } } while (!oldflags.Equals(resultflags)); @@ -626,6 +642,8 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.X64_POPCNT); if (resultflags.HasInstructionSet(InstructionSet.X64_AVXVNNI_X64)) resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet.X64_MOVBE_X64)) + resultflags.AddInstructionSet(InstructionSet.X64_MOVBE); if (resultflags.HasInstructionSet(InstructionSet.X64_X86Base)) resultflags.AddInstructionSet(InstructionSet.X64_SSE); if (resultflags.HasInstructionSet(InstructionSet.X64_SSE)) @@ -662,6 +680,8 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.X64_Vector256); if (resultflags.HasInstructionSet(InstructionSet.X64_AVX2)) resultflags.AddInstructionSet(InstructionSet.X64_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet.X64_SSE42)) + resultflags.AddInstructionSet(InstructionSet.X64_MOVBE); break; case TargetArchitecture.X86: @@ -701,6 +721,8 @@ private static InstructionSetFlags ExpandInstructionSetByReverseImplicationHelpe resultflags.AddInstructionSet(InstructionSet.X86_Vector256); if (resultflags.HasInstructionSet(InstructionSet.X86_AVX2)) resultflags.AddInstructionSet(InstructionSet.X86_AVXVNNI); + if (resultflags.HasInstructionSet(InstructionSet.X86_SSE42)) + resultflags.AddInstructionSet(InstructionSet.X86_MOVBE); break; } } while (!oldflags.Equals(resultflags)); @@ -765,6 +787,7 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("Vector128", "", InstructionSet.X64_Vector128, false); yield return new InstructionSetInfo("Vector256", "", InstructionSet.X64_Vector256, false); yield return new InstructionSetInfo("avxvnni", "AvxVnni", InstructionSet.X64_AVXVNNI, true); + yield return new InstructionSetInfo("movbe", "Movbe", InstructionSet.X64_MOVBE, true); break; case TargetArchitecture.X86: @@ -787,6 +810,7 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("Vector128", "", InstructionSet.X86_Vector128, false); yield return new InstructionSetInfo("Vector256", "", InstructionSet.X86_Vector256, false); yield return new InstructionSetInfo("avxvnni", "AvxVnni", InstructionSet.X86_AVXVNNI, true); + yield return new InstructionSetInfo("movbe", "Movbe", InstructionSet.X86_MOVBE, true); break; } } @@ -850,6 +874,8 @@ public void Set64BitInstructionSetVariants(TargetArchitecture architecture) AddInstructionSet(InstructionSet.X64_POPCNT_X64); if (HasInstructionSet(InstructionSet.X64_AVXVNNI)) AddInstructionSet(InstructionSet.X64_AVXVNNI_X64); + if (HasInstructionSet(InstructionSet.X64_MOVBE)) + AddInstructionSet(InstructionSet.X64_MOVBE_X64); break; case TargetArchitecture.X86: @@ -891,6 +917,7 @@ public void Set64BitInstructionSetVariantsUnconditionally(TargetArchitecture arc AddInstructionSet(InstructionSet.X64_PCLMULQDQ_X64); AddInstructionSet(InstructionSet.X64_POPCNT_X64); AddInstructionSet(InstructionSet.X64_AVXVNNI_X64); + AddInstructionSet(InstructionSet.X64_MOVBE_X64); break; case TargetArchitecture.X86: @@ -911,6 +938,7 @@ public void Set64BitInstructionSetVariantsUnconditionally(TargetArchitecture arc AddInstructionSet(InstructionSet.X86_PCLMULQDQ_X64); AddInstructionSet(InstructionSet.X86_POPCNT_X64); AddInstructionSet(InstructionSet.X86_AVXVNNI_X64); + AddInstructionSet(InstructionSet.X86_MOVBE_X64); break; } } diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt index 1ab412ba69f147..f1db26c5235561 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt @@ -44,6 +44,7 @@ instructionset ,X86 ,Popcnt , ,15 ,POPCNT ,popcnt instructionset ,X86 , , , ,Vector128, instructionset ,X86 , , , ,Vector256, instructionset ,X86 ,AvxVnni , ,25 ,AVXVNNI ,avxvnni +instructionset ,X86 ,Movbe , ,27 ,MOVBE ,movbe instructionset64bit,X86 ,X86Base instructionset64bit,X86 ,SSE @@ -62,6 +63,7 @@ instructionset64bit,X86 ,LZCNT instructionset64bit,X86 ,PCLMULQDQ instructionset64bit,X86 ,POPCNT instructionset64bit,X86 ,AVXVNNI +instructionset64bit,X86 ,MOVBE vectorinstructionset,X86 ,Vector128 vectorinstructionset,X86 ,Vector256 @@ -84,6 +86,7 @@ implication ,X86 ,POPCNT ,SSE42 implication ,X86 ,Vector128 ,SSE implication ,X86 ,Vector256 ,AVX implication ,X86 ,AVXVNNI ,AVX2 +implication ,X86 ,MOVBE ,SSE42 ; Definition of X64 instruction sets definearch ,X64 ,64Bit ,X64 diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index 6f7cf19da2ec91..f49d8a2ab0961b 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -1287,6 +1287,9 @@ void EEJitManager::SetCpuInfo() // CORJIT_FLAG_USE_SSE42 if the following feature bits are set (input EAX of 1) // CORJIT_FLAG_USE_SSE41 // SSE4.2 - ECX bit 20 + // CORJIT_FLAG_USE_MOVBE if the following feature bits are set (input EAX of 1) + // CORJIT_FLAG_USE_SSE42 + // MOVBE - ECX bit 22 // CORJIT_FLAG_USE_POPCNT if the following feature bits are set (input EAX of 1) // CORJIT_FLAG_USE_SSE42 // POPCNT - ECX bit 23 @@ -1358,6 +1361,11 @@ void EEJitManager::SetCpuInfo() { CPUCompileFlags.Set(InstructionSet_SSE42); + if ((cpuidInfo[ECX] & (1 << 22)) != 0) // MOVBE + { + CPUCompileFlags.Set(InstructionSet_MOVBE); + } + if ((cpuidInfo[ECX] & (1 << 23)) != 0) // POPCNT { CPUCompileFlags.Set(InstructionSet_POPCNT); @@ -1535,6 +1543,11 @@ void EEJitManager::SetCpuInfo() CPUCompileFlags.Clear(InstructionSet_PCLMULQDQ); } + if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableMOVBE)) + { + CPUCompileFlags.Clear(InstructionSet_MOVBE); + } + if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnablePOPCNT)) { CPUCompileFlags.Clear(InstructionSet_POPCNT); diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index e71bd36e36bd39..b45f19a96bb08e 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -20,12 +20,18 @@ class Program private const uint ConstantUInt32Input = 0x98765432; private const uint ConstantUInt32Expected = 0x32547698; + private const long ConstantInt64Input = 0x1edcba9876543210; + private const long ConstantInt64Expected = 0x1032547698badc1e; + private const ulong ConstantUInt64Input = 0xfedcba9876543210; private const ulong ConstantUInt64Expected = 0x1032547698badcfe; private static readonly byte[] s_bufferLE = new byte[] { 0x32, 0x54, 0x76, 0x98 }; private static readonly byte[] s_bufferBE = new byte[] { 0x98, 0x76, 0x54, 0x32 }; + private static readonly byte[] s_bufferLESigned64 = new byte[] { 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0x1e }; + private static readonly byte[] s_bufferBESigned64 = new byte[] { 0x1e, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10 }; + static int Main(string[] args) { /* @@ -73,6 +79,14 @@ static int Main(string[] args) return Fail; } + Span spanInt64 = BitConverter.IsLittleEndian ? s_bufferLESigned64 : s_bufferBESigned64; + long swappedSpanInt64 = BinaryPrimitives.ReverseEndianness(MemoryMarshal.Read(spanInt64)); + if (swappedSpanInt64 != ConstantInt64Expected) + { + ReportError("sign-extended Int64", ConstantInt64Input, swappedSpanInt64, ConstantInt64Expected); + return Fail; + } + /* * NON-CONST VALUE TESTS */ @@ -104,6 +118,37 @@ static int Main(string[] args) return Fail; } + /* + * WRITE TESTS + */ + + ushort writeUInt16Output = default; + ushort writeUInt16Input = ByteSwapUInt16_Write(ref writeUInt16Output); + ushort writeUInt16Expected = ByteSwapUInt16_Control(writeUInt16Input); + if (writeUInt16Output != writeUInt16Expected) + { + ReportError("write UInt16", writeUInt16Input, writeUInt16Output, writeUInt16Expected); + return Fail; + } + + uint writeUInt32Output = default; + uint writeUInt32Input = ByteSwapUInt32_Write(ref writeUInt32Output); + uint writeUInt32Expected = ByteSwapUInt32_Control(writeUInt32Input); + if (writeUInt32Output != writeUInt32Expected) + { + ReportError("write UInt32", writeUInt32Input, writeUInt32Output, writeUInt32Expected); + return Fail; + } + + ulong writeUInt64Output = default; + ulong writeUInt64Input = ByteSwapUInt64_Write(ref writeUInt64Output); + ulong writeUInt64Expected = ByteSwapUInt64_Control(writeUInt64Input); + if (writeUInt64Output != writeUInt64Expected) + { + ReportError("write UInt64", writeUInt64Input, writeUInt64Output, writeUInt64Expected); + return Fail; + } + return Pass; } @@ -143,6 +188,36 @@ private static ulong ByteSwapUnsigned_General(ulong value, int width) return retVal; } + [MethodImpl(MethodImplOptions.NoInlining)] + private static ushort ByteSwapUInt16_Write(ref ushort output) + { + ushort input = (ushort)DateTime.UtcNow.Ticks; + + output = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint ByteSwapUInt32_Write(ref uint output) + { + uint input = (uint)DateTime.UtcNow.Ticks; + + output = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ulong ByteSwapUInt64_Write(ref ulong output) + { + ulong input = (ulong)DateTime.UtcNow.Ticks; + + output = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + private static string GetHexString(T value) { if (typeof(T) == typeof(short)) From 0b6d2b960c288cd4fc3bea86a214e6b073a5f6c6 Mon Sep 17 00:00:00 2001 From: aromaa Date: Tue, 22 Mar 2022 04:00:04 +0200 Subject: [PATCH 02/10] Fix build --- src/coreclr/jit/codegenxarch.cpp | 7 +++---- src/coreclr/jit/lower.cpp | 4 ++-- .../JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs | 1 + 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index d86669732bfac7..42d02db0a8bdd0 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -558,7 +558,7 @@ void CodeGen::genCodeForBswap(GenTree* tree) regNumber targetReg = tree->GetRegNum(); var_types targetType = tree->TypeGet(); - GenTree* operand = tree->gtGetOp1(); + GenTree* operand = tree->gtGetOp1(); if (operand->isUsedFromReg()) { @@ -5146,9 +5146,8 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) } else { - GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP) && data->isContained() - ? INS_movbe - : ins_Store(data->TypeGet()), + GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP) && data->isContained() ? INS_movbe + : ins_Store(data->TypeGet()), emitTypeSize(tree), tree); } } diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 5959f7340c33ff..492d6e67c828d8 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -418,11 +418,11 @@ GenTree* Lowering::LowerNode(GenTree* node) } break; -#if defined(TARGET_XARCH) +#if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) case GT_BSWAP: LowerBswapOp(node->AsOp()); break; -#endif // TARGET_XARCH +#endif // FEATURE_HW_INTRINSICS && TARGET_XARCH default: break; diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index b45f19a96bb08e..2566ac42babb27 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -4,6 +4,7 @@ using System; using System.Buffers.Binary; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using Internal; From 3866ab146098ff0244a11c0e958f5ec3a775e41c Mon Sep 17 00:00:00 2001 From: aromaa Date: Fri, 25 Mar 2022 02:17:08 +0200 Subject: [PATCH 03/10] PR feedback --- src/coreclr/jit/codegenxarch.cpp | 20 +++--- src/coreclr/jit/emitxarch.cpp | 12 ++-- src/coreclr/jit/lowerxarch.cpp | 22 +++++-- src/coreclr/jit/lsrabuild.cpp | 3 +- .../BinaryPrimitivesReverseEndianness.cs | 62 +++++++++++++++++++ 5 files changed, 95 insertions(+), 24 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 42d02db0a8bdd0..9a27c2df2bcd2c 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -560,22 +560,12 @@ void CodeGen::genCodeForBswap(GenTree* tree) GenTree* operand = tree->gtGetOp1(); - if (operand->isUsedFromReg()) - { - regNumber operandReg = genConsumeReg(operand); - - inst_Mov(targetType, targetReg, operandReg, /* canSkip */ true); - } - else - { - assert(operand->OperIs(GT_IND)); - - genConsumeReg(operand->gtGetOp1()); - GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand); - } + genConsumeRegs(operand); if (operand->isUsedFromReg()) { + inst_Mov(targetType, targetReg, operand->GetRegNum(), /* canSkip */ true); + if (tree->OperIs(GT_BSWAP)) { // 32-bit and 64-bit byte swaps use "bswap reg" @@ -587,6 +577,10 @@ void CodeGen::genCodeForBswap(GenTree* tree) inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); } } + else + { + GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand); + } genProduceReg(tree); } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index a6da809bedac7f..fbff3ee2f15691 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3339,6 +3339,13 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m GenTree* addr = mem->Addr(); GenTree* data = mem->Data(); + if (data->OperIs(GT_BSWAP) && data->isContained()) + { + assert(ins == INS_movbe); + + data = data->gtGetOp1(); + } + if (addr->OperGet() == GT_CLS_VAR_ADDR) { if (data->isContainedIntOrIImmed()) @@ -3387,11 +3394,6 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m } else { - if (data->OperIs(GT_BSWAP) && data->isContained()) - { - data = data->gtGetOp1(); - } - assert(!data->isContained()); id = emitNewInstrAmd(attr, offset); id->idIns(ins); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 222186ab1b751d..e417bec5e0b0a5 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3945,23 +3945,37 @@ GenTree* Lowering::TryLowerAndOpToAndNot(GenTreeOp* andNode) return andnNode; } +//---------------------------------------------------------------------------------------------- +// Lowering::LowerBswapOp: Tries to contain GT_BSWAP node when possible +// +// Arguments: +// node - GT_BSWAP node to contain +// +// Notes: +// Containment is not performed when optimizations are disabled +// or when MOVBE instruction set is not found +// void Lowering::LowerBswapOp(GenTreeOp* node) { assert(node->OperIs(GT_BSWAP)); - if (!comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE)) + if (!comp->opts.OptimizationEnabled() || !comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE)) { return; } GenTree* operand = node->gtGetOp1(); - if (operand->OperIs(GT_IND)) + if (IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) { - operand->SetContained(); + MakeSrcContained(node, operand); + // Return early so we don't do double-containing return; } + // Due to this instruction being rare we can perform TryGetUse here + // to simplify the pattern recognition required for GT_STOREIND + // In cases where TP matters, it should be performed inside the user LIR::Use use; if (!BlockRange().TryGetUse(node, &use)) { @@ -3971,7 +3985,7 @@ void Lowering::LowerBswapOp(GenTreeOp* node) GenTree* user = use.User(); if (user->OperIs(GT_STOREIND)) { - node->SetContained(); + MakeSrcContained(user, node); } } diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 40995f1baeacb4..e2f40414c3c4c3 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3139,8 +3139,7 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates) } if (node->OperIs(GT_BSWAP)) { - BuildOperandUses(node->gtGetOp1(), candidates); - return 1; + return BuildOperandUses(node->gtGetOp1(), candidates); } #ifdef FEATURE_HW_INTRINSICS if (node->OperIsHWIntrinsic()) diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index 2566ac42babb27..6b1bf8af69f7c4 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -150,6 +150,37 @@ static int Main(string[] args) return Fail; } + /* + * WRITE TESTS LEA + */ + + ushort writeLeaUInt16Output = default; + ushort writeLeaUInt16Input = ByteSwapUInt16_WriteLea(ref writeLeaUInt16Output, 0); + ushort writeLeaUInt16Expected = ByteSwapUInt16_Control(writeLeaUInt16Input); + if (writeLeaUInt16Output != writeLeaUInt16Expected) + { + ReportError("write lea UInt16", writeLeaUInt16Input, writeLeaUInt16Output, writeLeaUInt16Expected); + return Fail; + } + + uint writeLeaUInt32Output = default; + uint writeLeaUInt32Input = ByteSwapUInt32_WriteLea(ref writeLeaUInt32Output, 0); + uint writeLeaUInt32Expected = ByteSwapUInt32_Control(writeLeaUInt32Input); + if (writeLeaUInt32Output != writeLeaUInt32Expected) + { + ReportError("write lea UInt32", writeLeaUInt32Input, writeLeaUInt32Output, writeLeaUInt32Expected); + return Fail; + } + + ulong writeLeaUInt64Output = default; + ulong writeLeaUInt64Input = ByteSwapUInt64_WriteLea(ref writeLeaUInt64Output, 0); + ulong writeLeaUInt64Expected = ByteSwapUInt64_Control(writeLeaUInt64Input); + if (writeLeaUInt64Output != writeLeaUInt64Expected) + { + ReportError("write lea UInt64", writeLeaUInt64Input, writeLeaUInt64Output, writeLeaUInt64Expected); + return Fail; + } + return Pass; } @@ -219,6 +250,37 @@ private static ulong ByteSwapUInt64_Write(ref ulong output) return input; } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ushort ByteSwapUInt16_WriteLea(ref ushort output, int offset) + { + ushort input = (ushort)DateTime.UtcNow.Ticks; + + Unsafe.Add(ref output, offset) = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint ByteSwapUInt32_WriteLea(ref uint output, int offset) + { + uint input = (uint)DateTime.UtcNow.Ticks; + + Unsafe.Add(ref output, offset) = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ulong ByteSwapUInt64_WriteLea(ref ulong output, int offset) + { + ulong input = (ulong)DateTime.UtcNow.Ticks; + + Unsafe.Add(ref output, offset) = BinaryPrimitives.ReverseEndianness(input); + + return input; + } + private static string GetHexString(T value) { if (typeof(T) == typeof(short)) From 20cb5d9267fea942e28c3323c5146ef685776269 Mon Sep 17 00:00:00 2001 From: aromaa Date: Sun, 27 Mar 2022 19:55:18 +0300 Subject: [PATCH 04/10] Support BSWAP16 --- src/coreclr/jit/codegenlinear.cpp | 2 +- src/coreclr/jit/codegenxarch.cpp | 5 +++-- src/coreclr/jit/emitxarch.cpp | 23 ++++++++++++++++++++++- src/coreclr/jit/instrsxarch.h | 5 ++++- src/coreclr/jit/lower.cpp | 1 + src/coreclr/jit/lowerxarch.cpp | 2 +- src/coreclr/jit/lsrabuild.cpp | 2 +- 7 files changed, 33 insertions(+), 7 deletions(-) diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index 152b8610394d55..d9df6b13d8977a 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -1653,7 +1653,7 @@ void CodeGen::genConsumeRegs(GenTree* tree) } #endif // FEATURE_HW_INTRINSICS #endif // TARGET_XARCH - else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH, GT_BSWAP)) + else if (tree->OperIs(GT_BITCAST, GT_NEG, GT_CAST, GT_LSH, GT_BSWAP, GT_BSWAP16)) { genConsumeRegs(tree->gtGetOp1()); } diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 9a27c2df2bcd2c..418b321b88b6a7 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -5140,8 +5140,9 @@ void CodeGen::genCodeForStoreInd(GenTreeStoreInd* tree) } else { - GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP) && data->isContained() ? INS_movbe - : ins_Store(data->TypeGet()), + GetEmitter()->emitInsStoreInd(data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained() + ? INS_movbe + : ins_Store(data->TypeGet()), emitTypeSize(tree), tree); } } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index fbff3ee2f15691..ce9b98b7efe43d 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -3339,7 +3339,7 @@ void emitter::emitInsStoreInd(instruction ins, emitAttr attr, GenTreeStoreInd* m GenTree* addr = mem->Addr(); GenTree* data = mem->Data(); - if (data->OperIs(GT_BSWAP) && data->isContained()) + if (data->OperIs(GT_BSWAP, GT_BSWAP16) && data->isContained()) { assert(ins == INS_movbe); @@ -10429,6 +10429,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Is this a 'big' opcode? else if (code & 0xFF000000) { + if (size == EA_2BYTE) + { + assert(ins == INS_movbe); + + dst += emitOutputByte(dst, 0x66); + } + // Output the REX prefix dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); @@ -11213,6 +11220,13 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Is this a 'big' opcode? else if (code & 0xFF000000) { + if (size == EA_2BYTE) + { + assert(ins == INS_movbe); + + dst += emitOutputByte(dst, 0x66); + } + // Output the REX prefix dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); @@ -11683,6 +11697,13 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Is this a 'big' opcode? else if (code & 0xFF000000) { + if (size == EA_2BYTE) + { + assert(ins == INS_movbe); + + dst += emitOutputByte(dst, 0x66); + } + // Output the REX prefix dst += emitOutputRexOrVexPrefixIfNeeded(ins, dst, code); diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a8916e84404f3a..e08667e87a9f5c 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -65,7 +65,6 @@ INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, // Multi-byte opcodes without modrm are represented in mixed endian fashion. // See comment around quarter way through this file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, INS_FLAGS_None ) -INST4(movbe, "movbe", IUM_WR, 0x380F00F1, BAD_CODE, 0x380F00F0, BAD_CODE, INS_FLAGS_Has_Wbit ) // id nm um mr mi rm a4 flags INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit ) @@ -168,6 +167,7 @@ INSTMUL(imul_15, "imul", IUM_RD, BAD_CODE, 0x4400003868, #define SSEDBL(c) PACK3(0xf2, 0x0f, c) #define PCKDBL(c) PACK3(0x66, 0x0f, c) #define PCKFLT(c) PACK2(0x0f,c) +#define PCKMVB(c) PACK3(0x0F, 0x38, c) // These macros encode extra byte that is implicit in the macro. #define PACK4(byte1,byte2,byte3,byte4) (((byte1) << 16) | ((byte2) << 24) | (byte3) | ((byte4) << 8)) @@ -619,6 +619,9 @@ INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, // LZCNT INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF ) +// MOVBE +INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_FLAGS_None ) + // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF ) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 492d6e67c828d8..e98f3331ba7aa7 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -420,6 +420,7 @@ GenTree* Lowering::LowerNode(GenTree* node) #if defined(FEATURE_HW_INTRINSICS) && defined(TARGET_XARCH) case GT_BSWAP: + case GT_BSWAP16: LowerBswapOp(node->AsOp()); break; #endif // FEATURE_HW_INTRINSICS && TARGET_XARCH diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index e417bec5e0b0a5..819d8bc5a27931 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3957,7 +3957,7 @@ GenTree* Lowering::TryLowerAndOpToAndNot(GenTreeOp* andNode) // void Lowering::LowerBswapOp(GenTreeOp* node) { - assert(node->OperIs(GT_BSWAP)); + assert(node->OperIs(GT_BSWAP, GT_BSWAP16)); if (!comp->opts.OptimizationEnabled() || !comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE)) { diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index e2f40414c3c4c3..869c0b017513a7 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -3137,7 +3137,7 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates) { return BuildAddrUses(node, candidates); } - if (node->OperIs(GT_BSWAP)) + if (node->OperIs(GT_BSWAP, GT_BSWAP16)) { return BuildOperandUses(node->gtGetOp1(), candidates); } From e35aa3cc71c0bc1f2e51da259ff6f2792ec456fb Mon Sep 17 00:00:00 2001 From: aromaa Date: Sun, 27 Mar 2022 20:26:42 +0300 Subject: [PATCH 05/10] Add NativeAOT configs --- src/coreclr/nativeaot/Runtime/IntrinsicConstants.h | 1 + src/coreclr/nativeaot/Runtime/startup.cpp | 5 +++++ .../Compiler/HardwareIntrinsicHelpers.Aot.cs | 3 +++ src/coreclr/tools/aot/ILCompiler/Program.cs | 1 + 4 files changed, 10 insertions(+) diff --git a/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h b/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h index e7bb62abfd058e..631109fbef9d9c 100644 --- a/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h +++ b/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h @@ -23,6 +23,7 @@ enum XArchIntrinsicConstants XArchIntrinsicConstants_Bmi2 = 0x0800, XArchIntrinsicConstants_Lzcnt = 0x1000, XArchIntrinsicConstants_AvxVnni = 0x2000, + XArchIntrinsicConstants_Movbe = 0x4000, }; #endif //HOST_X86 || HOST_AMD64 diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp index 0b43e8288d0943..eabdb8191d4269 100644 --- a/src/coreclr/nativeaot/Runtime/startup.cpp +++ b/src/coreclr/nativeaot/Runtime/startup.cpp @@ -178,6 +178,11 @@ bool DetectCPUFeatures() { g_cpuFeatures |= XArchIntrinsicConstants_Sse42; + if ((cpuidInfo[ECX] & (1 << 22)) != 0) // MOVBE + { + g_cpuFeatures |= XArchIntrinsicConstants_Movbe; + } + if ((cpuidInfo[ECX] & (1 << 23)) != 0) // POPCNT { g_cpuFeatures |= XArchIntrinsicConstants_Popcnt; diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs index 880b99d0107bce..5fade8d7825cfa 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs @@ -100,6 +100,7 @@ private static class XArchIntrinsicConstants public const int Bmi2 = 0x0800; public const int Lzcnt = 0x1000; public const int AvxVnni = 0x2000; + public const int Movbe = 0x4000; public static int FromInstructionSet(InstructionSet instructionSet) { @@ -137,6 +138,8 @@ public static int FromInstructionSet(InstructionSet instructionSet) InstructionSet.X64_LZCNT_X64 => Popcnt, InstructionSet.X64_AVXVNNI => AvxVnni, InstructionSet.X64_AVXVNNI_X64 => AvxVnni, + InstructionSet.X64_MOVBE => Movbe, + InstructionSet.X64_MOVBE_X64 => Movbe, // SSE and SSE2 are baseline ISAs - they're always available InstructionSet.X64_SSE => 0, diff --git a/src/coreclr/tools/aot/ILCompiler/Program.cs b/src/coreclr/tools/aot/ILCompiler/Program.cs index 9d8d0d00b05ded..37ea1959d6f1b0 100644 --- a/src/coreclr/tools/aot/ILCompiler/Program.cs +++ b/src/coreclr/tools/aot/ILCompiler/Program.cs @@ -499,6 +499,7 @@ private int Run(string[] args) optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("ssse3"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("aes"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("pclmul"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("movbe"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("popcnt"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lzcnt"); From 4a994222a3849c986eba14cb132c698eb9e7a2cc Mon Sep 17 00:00:00 2001 From: aromaa Date: Fri, 22 Apr 2022 23:02:39 +0300 Subject: [PATCH 06/10] PR feedback --- src/coreclr/jit/codegenxarch.cpp | 5 ++ src/coreclr/jit/lowerxarch.cpp | 26 +++------- .../BinaryPrimitivesReverseEndianness.cs | 50 ++++++++++++++++++- 3 files changed, 61 insertions(+), 20 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 418b321b88b6a7..20d9450ff4a7d6 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -575,6 +575,11 @@ void CodeGen::genCodeForBswap(GenTree* tree) { // 16-bit byte swaps use "ror reg.16, 8" inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); + + if (!genCanOmitNormalizationForBswap16(tree)) + { + GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); + } } } else diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 819d8bc5a27931..19b6c054871aec 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3965,27 +3965,9 @@ void Lowering::LowerBswapOp(GenTreeOp* node) } GenTree* operand = node->gtGetOp1(); - if (IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) + if (node->TypeGet() == operand->TypeGet() && IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) { MakeSrcContained(node, operand); - - // Return early so we don't do double-containing - return; - } - - // Due to this instruction being rare we can perform TryGetUse here - // to simplify the pattern recognition required for GT_STOREIND - // In cases where TP matters, it should be performed inside the user - LIR::Use use; - if (!BlockRange().TryGetUse(node, &use)) - { - return; - } - - GenTree* user = use.User(); - if (user->OperIs(GT_STOREIND)) - { - MakeSrcContained(user, node); } } @@ -4619,6 +4601,12 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) MakeSrcContained(node, src); } + if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && + comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE) && IsSafeToContainMem(node, src)) + { + MakeSrcContained(node, src); + } + ContainCheckIndir(node); } diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index 6b1bf8af69f7c4..e52e2543ba4737 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -181,6 +181,37 @@ static int Main(string[] args) return Fail; } + /* + * WRITE TESTS CAST + */ + + byte writeCastUInt16Input = (byte)DateTime.UtcNow.Ticks; + ushort writeCastUInt16Output = ByteSwapUInt16_WriteCast(ref writeCastUInt16Input); + ushort writeCastUInt16Expected = ByteSwapUInt16_Control(writeCastUInt16Input); + if (writeCastUInt16Output != writeCastUInt16Expected) + { + ReportError("write cast UInt16", writeCastUInt16Input, writeCastUInt16Output, writeCastUInt16Expected); + return Fail; + } + + ushort writeCastUInt32Input = (ushort)DateTime.UtcNow.Ticks; + uint writeCastUInt32Output = ByteSwapUInt32_WriteCast(ref writeCastUInt32Input); + uint writeCastUInt32Expected = ByteSwapUInt32_Control(writeCastUInt32Input); + if (writeCastUInt32Output != writeCastUInt32Expected) + { + ReportError("write cast UInt32", writeCastUInt32Input, writeCastUInt32Output, writeCastUInt32Expected); + return Fail; + } + + uint writeCastUInt64Input = (uint)DateTime.UtcNow.Ticks; + ulong writeCastUInt64Output = ByteSwapUInt64_WriteCast(ref writeCastUInt64Input); + ulong writeCastUInt64Expected = ByteSwapUInt64_Control(writeCastUInt64Input); + if (writeCastUInt64Output != writeCastUInt64Expected) + { + ReportError("write cast UInt64", writeCastUInt64Input, writeCastUInt64Output, writeCastUInt64Expected); + return Fail; + } + return Pass; } @@ -250,7 +281,6 @@ private static ulong ByteSwapUInt64_Write(ref ulong output) return input; } - [MethodImpl(MethodImplOptions.NoInlining)] private static ushort ByteSwapUInt16_WriteLea(ref ushort output, int offset) { @@ -281,6 +311,24 @@ private static ulong ByteSwapUInt64_WriteLea(ref ulong output, int offset) return input; } + [MethodImpl(MethodImplOptions.NoInlining)] + private static ushort ByteSwapUInt16_WriteCast(ref byte input) + { + return BinaryPrimitives.ReverseEndianness((ushort)input); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint ByteSwapUInt32_WriteCast(ref ushort input) + { + return BinaryPrimitives.ReverseEndianness((uint)input); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ulong ByteSwapUInt64_WriteCast(ref uint input) + { + return BinaryPrimitives.ReverseEndianness((ulong)input); + } + private static string GetHexString(T value) { if (typeof(T) == typeof(short)) From 6bac43db8a5fb4b6796df78d2fa40981aa876779 Mon Sep 17 00:00:00 2001 From: aromaa Date: Sun, 24 Apr 2022 23:24:04 +0300 Subject: [PATCH 07/10] Few fixes --- src/coreclr/jit/emitxarch.cpp | 4 +- src/coreclr/jit/lowerxarch.cpp | 4 +- .../BinaryPrimitivesReverseEndianness.cs | 52 +++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ce9b98b7efe43d..c5b9d438a17822 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -16314,13 +16314,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins if (memAccessKind == PERFSCORE_MEMORY_READ) { result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_THROUGHPUT_2C; + result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; } else { assert(memAccessKind == PERFSCORE_MEMORY_WRITE); result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += PERFSCORE_THROUGHPUT_2C; + result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; } break; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 19b6c054871aec..6651cf99b12ccf 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4601,7 +4601,9 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) MakeSrcContained(node, src); } - if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && + // If the source is a BSWAP, contain it on supported hardware to generate a MOVBE. + // Ensure that we are not double containing when the load has been contained. + if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && !src->gtGetOp1()->isContained() && comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE) && IsSafeToContainMem(node, src)) { MakeSrcContained(node, src); diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index e52e2543ba4737..5748f8e055547a 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -212,6 +212,40 @@ static int Main(string[] args) return Fail; } + /* + * READ & WRITE TESTS + */ + + ushort writeBackUInt16Input = (ushort)DateTime.UtcNow.Ticks; + ushort writeBackUInt16Output = writeBackUInt16Input; + ByteSwapUInt16_WriteBack(ref writeBackUInt16Output); + ushort writeBackUInt16Expected = ByteSwapUInt16_Control(writeBackUInt16Input); + if (writeBackUInt16Output != writeBackUInt16Expected) + { + ReportError("write back UInt16", writeBackUInt16Input, writeBackUInt16Output, writeBackUInt16Expected); + return Fail; + } + + uint writeBackUInt32Input = (uint)DateTime.UtcNow.Ticks; + uint writeBackUInt32Output = writeBackUInt32Input; + ByteSwapUInt32_WriteBack(ref writeBackUInt32Output); + uint writeBackUInt32Expected = ByteSwapUInt32_Control(writeBackUInt32Input); + if (writeBackUInt32Output != writeBackUInt32Expected) + { + ReportError("write back UInt32", writeBackUInt32Input, writeBackUInt32Output, writeBackUInt32Expected); + return Fail; + } + + ulong writeBackUInt64Input = (ulong)DateTime.UtcNow.Ticks; + ulong writeBackUInt64Output = writeBackUInt64Input; + ByteSwapUInt64_WriteBack(ref writeBackUInt64Output); + ulong writeBackUInt64Expected = ByteSwapUInt64_Control(writeBackUInt64Input); + if (writeBackUInt64Output != writeBackUInt64Expected) + { + ReportError("write back UInt64", writeBackUInt64Input, writeBackUInt64Output, writeBackUInt64Expected); + return Fail; + } + return Pass; } @@ -329,6 +363,24 @@ private static ulong ByteSwapUInt64_WriteCast(ref uint input) return BinaryPrimitives.ReverseEndianness((ulong)input); } + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ByteSwapUInt16_WriteBack(ref ushort value) + { + value = BinaryPrimitives.ReverseEndianness(value); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ByteSwapUInt32_WriteBack(ref uint value) + { + value = BinaryPrimitives.ReverseEndianness(value); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ByteSwapUInt64_WriteBack(ref ulong value) + { + value = BinaryPrimitives.ReverseEndianness(value); + } + private static string GetHexString(T value) { if (typeof(T) == typeof(short)) From f068a16d2ed87048277e009a441ca5e9a8df81a2 Mon Sep 17 00:00:00 2001 From: aromaa Date: Tue, 26 Apr 2022 03:33:54 +0300 Subject: [PATCH 08/10] Fix BSWAP16 loads --- src/coreclr/jit/codegenxarch.cpp | 5 +++++ src/coreclr/jit/emitxarch.cpp | 4 ++-- src/coreclr/jit/lowerxarch.cpp | 11 +++++++---- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 20d9450ff4a7d6..9d440e89a15d95 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -585,6 +585,11 @@ void CodeGen::genCodeForBswap(GenTree* tree) else { GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand); + + if (tree->OperIs(GT_BSWAP16) && !genCanOmitNormalizationForBswap16(tree)) + { + GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); + } } genProduceReg(tree); diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index c5b9d438a17822..cdce6388b2432f 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -16313,13 +16313,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_movbe: if (memAccessKind == PERFSCORE_MEMORY_READ) { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + result.insThroughput = PERFSCORE_THROUGHPUT_2X; result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; } else { assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insThroughput = PERFSCORE_THROUGHPUT_2X; + result.insThroughput = PERFSCORE_THROUGHPUT_1C; result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; } break; diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 6651cf99b12ccf..e348dd5bd7e662 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3964,8 +3964,9 @@ void Lowering::LowerBswapOp(GenTreeOp* node) return; } - GenTree* operand = node->gtGetOp1(); - if (node->TypeGet() == operand->TypeGet() && IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) + GenTree* operand = node->gtGetOp1(); + unsigned swapSize = node->OperIs(GT_BSWAP16) ? 2 : genTypeSize(node); + if (swapSize == genTypeSize(operand) && IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) { MakeSrcContained(node, operand); } @@ -4602,10 +4603,12 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) } // If the source is a BSWAP, contain it on supported hardware to generate a MOVBE. - // Ensure that we are not double containing when the load has been contained. - if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && !src->gtGetOp1()->isContained() && + if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE) && IsSafeToContainMem(node, src)) { + // Prefer containing in the store in case the load has been contained. + src->gtGetOp1()->ClearContained(); + MakeSrcContained(node, src); } From 548d5dfd8fef32a49ddeb5ea8a0e5daacfab7b2b Mon Sep 17 00:00:00 2001 From: aromaa Date: Tue, 26 Apr 2022 13:27:55 +0300 Subject: [PATCH 09/10] Nit --- src/coreclr/jit/codegenxarch.cpp | 13 ++++--------- src/coreclr/jit/lowerxarch.cpp | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 9d440e89a15d95..396f81970e4ee5 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -575,21 +575,16 @@ void CodeGen::genCodeForBswap(GenTree* tree) { // 16-bit byte swaps use "ror reg.16, 8" inst_RV_IV(INS_ror_N, targetReg, 8 /* val */, emitAttr::EA_2BYTE); - - if (!genCanOmitNormalizationForBswap16(tree)) - { - GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); - } } } else { GetEmitter()->emitInsBinary(INS_movbe, emitTypeSize(operand), tree, operand); + } - if (tree->OperIs(GT_BSWAP16) && !genCanOmitNormalizationForBswap16(tree)) - { - GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); - } + if (tree->OperIs(GT_BSWAP16) && !genCanOmitNormalizationForBswap16(tree)) + { + GetEmitter()->emitIns_Mov(INS_movzx, EA_2BYTE, targetReg, targetReg, /* canSkip */ false); } genProduceReg(tree); diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index e348dd5bd7e662..b983b9838f49d4 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3966,7 +3966,7 @@ void Lowering::LowerBswapOp(GenTreeOp* node) GenTree* operand = node->gtGetOp1(); unsigned swapSize = node->OperIs(GT_BSWAP16) ? 2 : genTypeSize(node); - if (swapSize == genTypeSize(operand) && IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) + if ((swapSize == genTypeSize(operand)) && IsContainableMemoryOp(operand) && IsSafeToContainMem(node, operand)) { MakeSrcContained(node, operand); } From 0273f6995d6e8f2983f0fd58cf30c528aa220534 Mon Sep 17 00:00:00 2001 From: aromaa Date: Tue, 26 Apr 2022 15:02:38 +0300 Subject: [PATCH 10/10] Ensure same size on store --- src/coreclr/jit/lowerxarch.cpp | 12 +- .../BinaryPrimitivesReverseEndianness.cs | 108 +++++++++++++----- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index b983b9838f49d4..ada86ba9ae95ce 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -4604,12 +4604,16 @@ void Lowering::ContainCheckStoreIndir(GenTreeStoreInd* node) // If the source is a BSWAP, contain it on supported hardware to generate a MOVBE. if (comp->opts.OptimizationEnabled() && src->OperIs(GT_BSWAP, GT_BSWAP16) && - comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE) && IsSafeToContainMem(node, src)) + comp->compOpportunisticallyDependsOn(InstructionSet_MOVBE)) { - // Prefer containing in the store in case the load has been contained. - src->gtGetOp1()->ClearContained(); + unsigned swapSize = src->OperIs(GT_BSWAP16) ? 2 : genTypeSize(src); + if ((swapSize == genTypeSize(node)) && IsSafeToContainMem(node, src)) + { + // Prefer containing in the store in case the load has been contained. + src->gtGetOp1()->ClearContained(); - MakeSrcContained(node, src); + MakeSrcContained(node, src); + } } ContainCheckIndir(node); diff --git a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs index 5748f8e055547a..65908c7c1ee616 100644 --- a/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs +++ b/src/tests/JIT/Intrinsics/BinaryPrimitivesReverseEndianness.cs @@ -119,6 +119,37 @@ static int Main(string[] args) return Fail; } + /* + * READ TESTS CAST + */ + + byte readCastUInt16Input = (byte)DateTime.UtcNow.Ticks; + ushort readCastUInt16Output = ByteSwapUInt16_ReadCast(ref readCastUInt16Input); + ushort readCastUInt16Expected = ByteSwapUInt16_Control(readCastUInt16Input); + if (readCastUInt16Output != readCastUInt16Expected) + { + ReportError("read cast UInt16", readCastUInt16Input, readCastUInt16Output, readCastUInt16Expected); + return Fail; + } + + ushort readCastUInt32Input = (ushort)DateTime.UtcNow.Ticks; + uint readCastUInt32Output = ByteSwapUInt32_ReadCast(ref readCastUInt32Input); + uint readCastUInt32Expected = ByteSwapUInt32_Control(readCastUInt32Input); + if (readCastUInt32Output != readCastUInt32Expected) + { + ReportError("read cast UInt32", readCastUInt32Input, readCastUInt32Output, readCastUInt32Expected); + return Fail; + } + + uint readCastUInt64Input = (uint)DateTime.UtcNow.Ticks; + ulong readCastUInt64Output = ByteSwapUInt64_ReadCast(ref readCastUInt64Input); + ulong readCastUInt64Expected = ByteSwapUInt64_Control(readCastUInt64Input); + if (readCastUInt64Output != readCastUInt64Expected) + { + ReportError("read cast UInt64", readCastUInt64Input, readCastUInt64Output, readCastUInt64Expected); + return Fail; + } + /* * WRITE TESTS */ @@ -185,30 +216,33 @@ static int Main(string[] args) * WRITE TESTS CAST */ - byte writeCastUInt16Input = (byte)DateTime.UtcNow.Ticks; - ushort writeCastUInt16Output = ByteSwapUInt16_WriteCast(ref writeCastUInt16Input); - ushort writeCastUInt16Expected = ByteSwapUInt16_Control(writeCastUInt16Input); - if (writeCastUInt16Output != writeCastUInt16Expected) + ulong writeCastUInt8Input = (ulong)DateTime.UtcNow.Ticks; + byte writeCastUInt8Output = default; + ByteSwapUInt8_WriteCast(ref writeCastUInt8Output, writeCastUInt8Input); + ulong writeCastUInt8Expected = (byte)ByteSwapUInt64_Control(writeCastUInt8Input); + if (writeCastUInt8Output != writeCastUInt8Expected) { - ReportError("write cast UInt16", writeCastUInt16Input, writeCastUInt16Output, writeCastUInt16Expected); + ReportError("write cast UInt8", writeCastUInt8Input, writeCastUInt8Output, writeCastUInt8Expected); return Fail; } - ushort writeCastUInt32Input = (ushort)DateTime.UtcNow.Ticks; - uint writeCastUInt32Output = ByteSwapUInt32_WriteCast(ref writeCastUInt32Input); - uint writeCastUInt32Expected = ByteSwapUInt32_Control(writeCastUInt32Input); - if (writeCastUInt32Output != writeCastUInt32Expected) + ulong writeCastUInt16Input = (ulong)DateTime.UtcNow.Ticks; + ushort writeCastUInt16Output = default; + ByteSwapUInt16_WriteCast(ref writeCastUInt16Output, writeCastUInt16Input); + ulong writeCastUInt16Expected = (ushort)ByteSwapUInt64_Control(writeCastUInt16Input); + if (writeCastUInt16Output != writeCastUInt16Expected) { - ReportError("write cast UInt32", writeCastUInt32Input, writeCastUInt32Output, writeCastUInt32Expected); + ReportError("write cast UInt16", writeCastUInt16Input, writeCastUInt16Output, writeCastUInt16Expected); return Fail; } - uint writeCastUInt64Input = (uint)DateTime.UtcNow.Ticks; - ulong writeCastUInt64Output = ByteSwapUInt64_WriteCast(ref writeCastUInt64Input); - ulong writeCastUInt64Expected = ByteSwapUInt64_Control(writeCastUInt64Input); - if (writeCastUInt64Output != writeCastUInt64Expected) + ulong writeCastUInt32Input = (ulong)DateTime.UtcNow.Ticks; + uint writeCastUInt32Output = default; + ByteSwapUInt32_WriteCast(ref writeCastUInt32Output, writeCastUInt32Input); + ulong writeCastUInt32Expected = (uint)ByteSwapUInt64_Control(writeCastUInt32Input); + if (writeCastUInt32Output != writeCastUInt32Expected) { - ReportError("write cast UInt64", writeCastUInt64Input, writeCastUInt64Output, writeCastUInt64Expected); + ReportError("write cast UInt32", writeCastUInt32Input, writeCastUInt32Output, writeCastUInt32Expected); return Fail; } @@ -285,6 +319,24 @@ private static ulong ByteSwapUnsigned_General(ulong value, int width) return retVal; } + [MethodImpl(MethodImplOptions.NoInlining)] + private static ushort ByteSwapUInt16_ReadCast(ref byte input) + { + return BinaryPrimitives.ReverseEndianness((ushort)input); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static uint ByteSwapUInt32_ReadCast(ref ushort input) + { + return BinaryPrimitives.ReverseEndianness((uint)input); + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static ulong ByteSwapUInt64_ReadCast(ref uint input) + { + return BinaryPrimitives.ReverseEndianness((ulong)input); + } + [MethodImpl(MethodImplOptions.NoInlining)] private static ushort ByteSwapUInt16_Write(ref ushort output) { @@ -346,39 +398,39 @@ private static ulong ByteSwapUInt64_WriteLea(ref ulong output, int offset) } [MethodImpl(MethodImplOptions.NoInlining)] - private static ushort ByteSwapUInt16_WriteCast(ref byte input) + private static void ByteSwapUInt16_WriteBack(ref ushort value) { - return BinaryPrimitives.ReverseEndianness((ushort)input); + value = BinaryPrimitives.ReverseEndianness(value); } [MethodImpl(MethodImplOptions.NoInlining)] - private static uint ByteSwapUInt32_WriteCast(ref ushort input) + private static void ByteSwapUInt32_WriteBack(ref uint value) { - return BinaryPrimitives.ReverseEndianness((uint)input); + value = BinaryPrimitives.ReverseEndianness(value); } [MethodImpl(MethodImplOptions.NoInlining)] - private static ulong ByteSwapUInt64_WriteCast(ref uint input) + private static void ByteSwapUInt64_WriteBack(ref ulong value) { - return BinaryPrimitives.ReverseEndianness((ulong)input); + value = BinaryPrimitives.ReverseEndianness(value); } - + [MethodImpl(MethodImplOptions.NoInlining)] - private static void ByteSwapUInt16_WriteBack(ref ushort value) + private static void ByteSwapUInt8_WriteCast(ref byte output, ulong input) { - value = BinaryPrimitives.ReverseEndianness(value); + output = (byte)BinaryPrimitives.ReverseEndianness(input); } [MethodImpl(MethodImplOptions.NoInlining)] - private static void ByteSwapUInt32_WriteBack(ref uint value) + private static void ByteSwapUInt16_WriteCast(ref ushort output, ulong input) { - value = BinaryPrimitives.ReverseEndianness(value); + output = (ushort)BinaryPrimitives.ReverseEndianness(input); } [MethodImpl(MethodImplOptions.NoInlining)] - private static void ByteSwapUInt64_WriteBack(ref ulong value) + private static void ByteSwapUInt32_WriteCast(ref uint output, ulong input) { - value = BinaryPrimitives.ReverseEndianness(value); + output = (uint)BinaryPrimitives.ReverseEndianness(input); } private static string GetHexString(T value)