diff --git a/eng/pipelines/common/templates/runtimes/run-test-job.yml b/eng/pipelines/common/templates/runtimes/run-test-job.yml index 6b238012679736..a2ca44678e6b1e 100644 --- a/eng/pipelines/common/templates/runtimes/run-test-job.yml +++ b/eng/pipelines/common/templates/runtimes/run-test-job.yml @@ -536,6 +536,7 @@ jobs: ${{ if in(parameters.testGroup, 'jitstress-isas-avx512') }}: scenarios: - jitstress_isas_avx512_forceevex + - jitstress_isas_avx512_forceevex_stresshighregs ${{ if in(parameters.testGroup, 'jitstressregs-x86') }}: scenarios: - jitstressregs1_x86_noavx diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h index aa3fbefad70039..2d5051be6c0583 100644 --- a/src/coreclr/jit/codegen.h +++ b/src/coreclr/jit/codegen.h @@ -35,6 +35,17 @@ class CodeGen final : public CodeGenInterface GenTree* addr, bool fold, bool* revPtr, GenTree** rv1Ptr, GenTree** rv2Ptr, unsigned* mulPtr, ssize_t* cnsPtr); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } +#endif // TARGET_AMD64 + #if defined(TARGET_XARCH) // Bit masks used in negating a float or double number. // This is to avoid creating more than one data constant for these bitmasks when a diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 5c1c26150eae19..f933f685ad6488 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -3524,7 +3524,7 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode) // this probably needs to be changed. // Load - genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src, offset); + genCodeForLoadOffset(INS_movdqu, EA_16BYTE, xmmTmpReg, src, offset); // Store genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset); @@ -8347,7 +8347,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset { ins = INS_movdqu; // This should be changed! - attr = EA_8BYTE; + attr = EA_16BYTE; size = 16; } else diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index 6c8445b6fee744..84249af733545f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -3325,6 +3325,24 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compJitSaveFpLrWithCalleeSavedRegisters = JitConfig.JitSaveFpLrWithCalleeSavedRegisters(); } #endif // defined(DEBUG) && defined(TARGET_ARM64) + +#if defined(TARGET_AMD64) + rbmAllFloat = RBM_ALLFLOAT_INIT; + rbmFltCalleeTrash = RBM_FLT_CALLEE_TRASH_INIT; + cntCalleeTrashFloat = CNT_CALLEE_TRASH_FLOAT_INIT; + availableRegCount = ACTUAL_REG_COUNT; + + if (DoJitStressEvexEncoding()) + { + rbmAllFloat |= RBM_HIGHFLOAT; + rbmFltCalleeTrash |= RBM_HIGHFLOAT; + cntCalleeTrashFloat += CNT_CALLEE_TRASH_HIGHFLOAT; + } + else + { + availableRegCount -= CNT_HIGHFLOAT; + } +#endif // TARGET_AMD64 } #ifdef DEBUG @@ -3528,6 +3546,37 @@ bool Compiler::compPromoteFewerStructs(unsigned lclNum) return rejectThisPromo; } +//------------------------------------------------------------------------ +// dumpRegMask: display a register mask. For well-known sets of registers, display a well-known token instead of +// a potentially large number of registers. 
+// +// Arguments: +// regs - The set of registers to display +// +void Compiler::dumpRegMask(regMaskTP regs) const +{ + if (regs == RBM_ALLINT) + { + printf("[allInt]"); + } + else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) + { + printf("[allIntButFP]"); + } + else if (regs == RBM_ALLFLOAT) + { + printf("[allFloat]"); + } + else if (regs == RBM_ALLDOUBLE) + { + printf("[allDouble]"); + } + else + { + dspRegMask(regs); + } +} + #endif // DEBUG void Compiler::compInitDebuggingInfo() diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 11fb3c3986d7a7..8a231f698a663c 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -10361,6 +10361,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool compJitHaltMethod(); + void dumpRegMask(regMaskTP regs) const; + #endif /* @@ -10635,6 +10637,48 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX GenTree* fgMorphMultiregStructArg(CallArg* arg); bool killGCRefs(GenTree* tree); + +#if defined(TARGET_AMD64) +private: + // The following are for initializing register allocator "constants" defined in targetamd64.h + // that now depend upon runtime ISA information, e.g., the presence of AVX512F/VL, which increases + // the number of SIMD (xmm, ymm, and zmm) registers from 16 to 32. + // As only 64-bit xarch has the capability to have the additional registers, we limit the changes + // to TARGET_AMD64 only. + // + // Users of these values need to define four accessor functions: + // + // regMaskTP get_RBM_ALLFLOAT(); + // regMaskTP get_RBM_FLT_CALLEE_TRASH(); + // unsigned get_CNT_CALLEE_TRASH_FLOAT(); + // unsigned get_AVAILABLE_REG_COUNT(); + // + // which return the values of these variables. + // + // This was done to avoid polluting all `targetXXX.h` macro definitions with a compiler parameter, where only + // TARGET_AMD64 requires one. + // + regMaskTP rbmAllFloat; + regMaskTP rbmFltCalleeTrash; + unsigned cntCalleeTrashFloat; + unsigned availableRegCount; + +public: + regMaskTP get_RBM_ALLFLOAT() const + { + return rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return rbmFltCalleeTrash; + } + unsigned get_CNT_CALLEE_TRASH_FLOAT() const + { + return cntCalleeTrashFloat; + } + +#endif // TARGET_AMD64 + }; // end of class Compiler //--------------------------------------------------------------------------------------------------------------------- diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 692eb10d12359e..0f12cd4644b93b 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -99,6 +99,17 @@ void emitLocation::Print(LONG compMethodID) const } #endif // DEBUG +#if defined(TARGET_AMD64) +inline regMaskTP emitter::get_RBM_FLT_CALLEE_TRASH() const +{ + return emitComp->rbmFltCalleeTrash; +} +inline unsigned emitter::get_AVAILABLE_REG_COUNT() const +{ + return emitComp->availableRegCount; +} +#endif // TARGET_AMD64 + /***************************************************************************** * * Return the name of an instruction format. 
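The comment block added to compiler.h above refers to the macro side of this pattern in targetamd64.h, which is not part of this section. As a rough sketch of how those definitions could forward to the new accessors (the macro bodies below are assumed for illustration; only the accessor names come from this diff):

    // Hypothetical targetamd64.h wiring: the former register-set constants
    // become macros that call through to the accessors, so existing uses of
    // RBM_ALLFLOAT and friends keep compiling while the values become
    // per-compilation (AVX-512 adds xmm16-xmm31 when EVEX is in use).
    #if defined(TARGET_AMD64)
    #define RBM_ALLFLOAT           get_RBM_ALLFLOAT()
    #define RBM_FLT_CALLEE_TRASH   get_RBM_FLT_CALLEE_TRASH()
    #define CNT_CALLEE_TRASH_FLOAT get_CNT_CALLEE_TRASH_FLOAT()
    #define AVAILABLE_REG_COUNT    get_AVAILABLE_REG_COUNT()
    #endif // TARGET_AMD64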
@@ -3204,11 +3215,19 @@ void emitter::emitDispRegSet(regMaskTP regs) for (reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) { - if ((regs & genRegMask(reg)) == 0) + if (regs == RBM_NONE) + { + break; + } + + regMaskTP curReg = genRegMask(reg); + if ((regs & curReg) == 0) { continue; } + regs -= curReg; + if (sp) { printf(" "); @@ -3378,6 +3397,7 @@ emitter::instrDesc* emitter::emitNewInstrCallInd(int argCnt, #endif // TARGET_XARCH /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; @@ -3450,6 +3470,7 @@ emitter::instrDesc* emitter::emitNewInstrCallDir(int argCnt, assert(!id->idIsLargeCns()); /* Save the live GC registers in the unused register fields */ + assert((gcrefRegs & RBM_CALLEE_TRASH) == 0); emitEncodeCallGCregs(gcrefRegs, id); return id; diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 4d4b75ad351073..fe27b94ee62d10 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -1121,6 +1121,28 @@ class emitter idAddr()->_idReg4 = reg; assert(reg == idAddr()->_idReg4); } + bool idHasReg3() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD: + case IF_RWR_RRD_RRD_CNS: + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } + bool idHasReg4() const + { + switch (idInsFmt()) + { + case IF_RWR_RRD_RRD_RRD: + return true; + default: + return false; + } + } #endif // defined(TARGET_XARCH) #ifdef TARGET_ARMARCH insOpts idInsOpt() const @@ -1951,6 +1973,11 @@ class emitter CORINFO_FIELD_HANDLE emitBlkConst(const void* cnsAddr, unsigned cnsSize, unsigned cnsAlign, var_types elemType); private: +#if defined(TARGET_AMD64) + regMaskTP get_RBM_FLT_CALLEE_TRASH() const; + unsigned get_AVAILABLE_REG_COUNT() const; +#endif // TARGET_AMD64 + CORINFO_FIELD_HANDLE emitFltOrDblConst(double constValue, emitAttr attr); CORINFO_FIELD_HANDLE emitSimd8Const(simd8_t constValue); CORINFO_FIELD_HANDLE emitSimd16Const(simd16_t constValue); diff --git a/src/coreclr/jit/emitinl.h b/src/coreclr/jit/emitinl.h index 82c78299efebd3..125c1ddd0fbd3f 100644 --- a/src/coreclr/jit/emitinl.h +++ b/src/coreclr/jit/emitinl.h @@ -211,11 +211,8 @@ inline ssize_t emitter::emitGetInsAmdAny(instrDesc* id) * * Convert between a register mask and a smaller version for storage. */ - /*static*/ inline void emitter::emitEncodeCallGCregs(regMaskTP regmask, instrDesc* id) { - assert((regmask & RBM_CALLEE_TRASH) == 0); - unsigned encodeMask; #ifdef TARGET_X86 diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 2541ed4473a722..9fb35e9f94375a 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -217,10 +217,10 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_phminposuw: case INS_mpsadbw: case INS_pclmulqdq: - case INS_aesdec: - case INS_aesdeclast: case INS_aesenc: case INS_aesenclast: + case INS_aesdec: + case INS_aesdeclast: case INS_aesimc: case INS_aeskeygenassist: case INS_vzeroupper: @@ -260,21 +260,30 @@ bool emitter::IsEvexEncodedInstruction(instruction ins) const case INS_prefetcht2: case INS_sfence: // Might need new INS_*suffix* instructions for these. - case INS_por: // INS_pord, INS_porq. - case INS_pxor: // INS_pxord, INS_pxorq - case INS_movdqa: // INS_movdqa32, INS_movdqa64. - case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64. - case INS_pand: // INS_pandd, INS_pandq. - case INS_pandn: // INS_pandnd, INS_pandnq. 
-        case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2.
-        case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2.
-        case INS_vinsertf128:  // INS_vinsertf32x4, INS_vinsertf64x2.
-        case INS_vinserti128:  // INS_vinserti32x4, INS_vinserti64x2.
         case INS_vbroadcastf128: // INS_vbroadcastf32x4, INS_vbroadcastf64x2.
         case INS_vbroadcasti128: // INS_vbroadcasti32x4, INS_vbroadcasti64x2.
-        {
-            return false;
-        }
+
+            // TODO-XARCH-AVX512: these need to be encoded with the proper individual EVEX instructions
+            // (movdqu8, movdqu16, etc.)
+            // For implementation speed, the standing instruction is set up to default to the 32-bit
+            // operand type, i.e., movdqu => movdqu32, etc.
+            // Since we are not using k registers yet, this has no impact on correctness, but it will
+            // affect things once k registers are used (as that is the point of the "break out operand
+            // type" of these instructions).
+            // case INS_movdqa: // INS_movdqa32, INS_movdqa64.
+            // case INS_movdqu: // INS_movdqu8, INS_movdqu16, INS_movdqu32, INS_movdqu64.
+            // case INS_pand: // INS_pandd, INS_pandq.
+            // case INS_pandn: // INS_pandnd, INS_pandnq.
+            // case INS_por: // INS_pord, INS_porq.
+            // case INS_pxor: // INS_pxord, INS_pxorq
+            // case INS_vextractf128: // INS_vextractf32x4, INS_vextractf64x2.
+            // case INS_vextracti128: // INS_vextracti32x4, INS_vextracti64x2.
+            // case INS_vinsertf128: // INS_vinsertf32x4, INS_vinsertf64x2.
+            // case INS_vinserti128: // INS_vinserti32x4, INS_vinserti64x2.
+        {
+            return false;
+        }

         default:
         {
             break;
         }
@@ -761,9 +770,11 @@ bool emitter::Is4ByteSSEInstruction(instruction ins) const
 // Return Value:
 //    true if this instruction requires a VEX or EVEX prefix.
 //
-bool emitter::TakesSimdPrefix(instruction ins) const
+bool emitter::TakesSimdPrefix(const instrDesc* id) const
 {
-    return TakesEvexPrefix(ins) || TakesVexPrefix(ins);
+    instruction ins = id->idIns();
+
+    return TakesEvexPrefix(id) || TakesVexPrefix(ins);
 }

 //------------------------------------------------------------------------
@@ -785,13 +796,23 @@ bool emitter::TakesSimdPrefix(instruction ins) const
 // Return Value:
 //    true if this instruction requires a EVEX prefix.
 //
-bool emitter::TakesEvexPrefix(instruction ins) const
+bool emitter::TakesEvexPrefix(const instrDesc* id) const
 {
     if (!emitComp->DoJitStressEvexEncoding())
     {
         return false;
     }

+    instruction ins = id->idIns();
+
+    if (HasHighSIMDReg(id))
+    {
+        assert(IsEvexEncodedInstruction(ins));
+        // TODO-XARCH-AVX512: remove this check once k registers have been implemented
+        assert(!HasKMaskRegisterDest(ins));
+        return true;
+    }
+
     // TODO-XArch-AVX512: Revisit 'HasKMaskRegisterDest()' check once KMask support is added.
     return IsEvexEncodedInstruction(ins) && !HasKMaskRegisterDest(ins);
 }
@@ -1059,6 +1080,50 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr)
 #endif //! TARGET_AMD64
 }

+//------------------------------------------------------------------------
+// HasHighSIMDReg: Checks if an instruction uses high SIMD registers (mm16-mm31)
+// and will require one of the EVEX high SIMD bits (EVEX.R', EVEX.V', EVEX.X)
+//
+// Arguments:
+//    id -- instruction descriptor for encoding
+//
+// Return Value:
+//    true if instruction will require EVEX encoding for its register operands.
+bool emitter::HasHighSIMDReg(const instrDesc* id) const
+{
+#if defined(TARGET_AMD64)
+    if (IsHighSIMDReg(id->idReg1()) || IsHighSIMDReg(id->idReg2()))
+        return true;
+
+    if (id->idIsSmallDsc())
+        return false;
+
+    if ((id->idHasReg3() && IsHighSIMDReg(id->idReg3())) || (id->idHasReg4() && IsHighSIMDReg(id->idReg4())))
+        return true;
+#endif
+    // X86 JIT operates in 32-bit mode and hence extended reg are not available.
+    return false;
+}
+
+//------------------------------------------------------------------------
+// IsHighSIMDReg: Checks if a register is strictly an EVEX-encoded high SIMD
+// register (mm16-mm31).
+//
+// Arguments:
+//    reg -- register to check
+//
+// Return Value:
+//    true if the register is strictly an EVEX-encoded high SIMD register
+bool emitter::IsHighSIMDReg(regNumber reg) const
+{
+#ifdef TARGET_AMD64
+    return ((reg >= REG_XMM16) && (reg <= REG_XMM31));
+#else
+    // X86 JIT operates in 32-bit mode and hence extended reg are not available.
+    return false;
+#endif
+}
+
 // Returns true if using this register will require a REX.* prefix.
 // Since XMM registers overlap with YMM registers, this routine
 // can also be used to know whether a YMM register if the
@@ -1066,7 +1131,7 @@ bool emitter::TakesRexWPrefix(instruction ins, emitAttr attr)
 bool IsExtendedReg(regNumber reg)
 {
 #ifdef TARGET_AMD64
-    return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM15));
+    return ((reg >= REG_R8) && (reg <= REG_R15)) || ((reg >= REG_XMM8) && (reg <= REG_XMM31));
 #else
     // X86 JIT operates in 32-bit mode and hence extended reg are not available.
     return false;
@@ -1078,7 +1143,7 @@ bool IsExtendedReg(regNumber reg, emitAttr attr)
 {
 #ifdef TARGET_AMD64
     // Not a register, so doesn't need a prefix
-    if (reg > REG_XMM15)
+    if (reg > REG_XMM31)
     {
         return false;
     }
@@ -1119,12 +1184,29 @@ bool IsExtendedReg(regNumber reg, emitAttr attr)
 bool IsXMMReg(regNumber reg)
 {
 #ifdef TARGET_AMD64
-    return (reg >= REG_XMM0) && (reg <= REG_XMM15);
+    return (reg >= REG_XMM0) && (reg <= REG_XMM31);
 #else // !TARGET_AMD64
     return (reg >= REG_XMM0) && (reg <= REG_XMM7);
 #endif // !TARGET_AMD64
 }

+//------------------------------------------------------------------------
+// HighAwareRegEncoding: For EVEX-encoded high SIMD registers (mm16-mm31),
+// get the low four bits (bits 0-3) of the register encoding; the fifth bit
+// is encoded separately via EVEX.R', EVEX.V', or EVEX.X.
+//
+// Arguments:
+//    reg -- register to encode
+//
+// Return Value:
+//    bits 0-3 of the register encoding
+//
+unsigned HighAwareRegEncoding(regNumber reg)
+{
+    static_assert((REG_XMM0 & 0x7) == 0, "bad XMMBASE");
+    return (unsigned)(reg & 0xF);
+}
+
 // Returns bits to be encoded in instruction for the given register.
 unsigned RegEncoding(regNumber reg)
 {
@@ -1135,11 +1217,13 @@ unsigned RegEncoding(regNumber reg)
 // Utility routines that abstract the logic of adding REX.W, REX.R, REX.X, REX.B and REX prefixes
 // SSE2: separate 1-byte prefix gets added before opcode.
 // AVX: specific bits within VEX prefix need to be set in bit-inverted form.
-emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)
+emitter::code_t emitter::AddRexWPrefix(const instrDesc* id, code_t code)
 {
+    instruction ins = id->idIns();
+
     if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
-        if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+        if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
        {
            // W-bit is available in 4-byte EVEX prefix that starts with byte 62.
            assert(hasEvexPrefix(code));
@@ -1169,11 +1253,13 @@ emitter::code_t emitter::AddRexWPrefix(instruction ins, code_t code)

 #ifdef TARGET_AMD64

-emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
+emitter::code_t emitter::AddRexRPrefix(const instrDesc* id, code_t code)
 {
+    instruction ins = id->idIns();
+
     if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
-        if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+        if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
         {
             // R-bit is available in 4-byte EVEX prefix that starts with byte 62.
             assert(hasEvexPrefix(code));
@@ -1197,11 +1283,13 @@ emitter::code_t emitter::AddRexRPrefix(instruction ins, code_t code)
     return code | 0x4400000000ULL;
 }

-emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
+emitter::code_t emitter::AddRexXPrefix(const instrDesc* id, code_t code)
 {
+    instruction ins = id->idIns();
+
     if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
-        if (TakesEvexPrefix(ins))
+        if (TakesEvexPrefix(id))
         {
             // X-bit is available in 4-byte EVEX prefix that starts with byte 62.
             assert(hasEvexPrefix(code));
@@ -1224,11 +1312,13 @@ emitter::code_t emitter::AddRexXPrefix(instruction ins, code_t code)
     return code | 0x4200000000ULL;
 }

-emitter::code_t emitter::AddRexBPrefix(instruction ins, code_t code)
+emitter::code_t emitter::AddRexBPrefix(const instrDesc* id, code_t code)
 {
+    instruction ins = id->idIns();
+
     if (UseEvexEncoding() && IsEvexEncodedInstruction(ins))
     {
-        if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
+        if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) // TODO-XArch-AVX512: Remove codeEvexMigrationCheck().
         {
             // B-bit is available in 4-byte EVEX prefix that starts with byte 62.
             assert(hasEvexPrefix(code));
@@ -1260,6 +1350,46 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code)
     return code | 0x4000000000ULL;
 }

+//------------------------------------------------------------------------
+// AddEvexVPrimePrefix: Add the EVEX.V' bit to the EVEX prefix. EVEX.V'
+// is encoded in inverted form.
+//
+// Arguments:
+//    code -- opcode bits to which the EVEX.V' bit is added
+//
+// Return Value:
+//    code with EVEX.V' set in inverted form.
+//
+emitter::code_t emitter::AddEvexVPrimePrefix(code_t code)
+{
+#if defined(TARGET_AMD64)
+    assert(UseEvexEncoding() && hasEvexPrefix(code));
+    return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL);
+#else
+    unreached();
+#endif
+}
+
+//------------------------------------------------------------------------
+// AddEvexRPrimePrefix: Add the EVEX.R' bit to the EVEX prefix. EVEX.R'
+// is encoded in inverted form.
+//
+// Arguments:
+//    code -- opcode bits to which the EVEX.R' bit is added
+//
+// Return Value:
+//    code with EVEX.R' set in inverted form.
+//
+emitter::code_t emitter::AddEvexRPrimePrefix(code_t code)
+{
+#if defined(TARGET_AMD64)
+    assert(UseEvexEncoding() && hasEvexPrefix(code));
+    return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL);
+#else
+    unreached();
+#endif
+}
+
 #endif // TARGET_AMD64

 bool isPrefix(BYTE b)
{
@@ -1800,7 +1930,7 @@ unsigned emitter::emitGetAdjustedSize(instrDesc* id, code_t code) const
     // IsEvexEncodedInstruction(ins) is `true` for AVX/SSE instructions also which needs to be VEX encoded unless
     // explicitly
     // asked for EVEX.
- if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(ins)) + if (IsEvexEncodedInstruction(ins) && TakesEvexPrefix(id)) { // EVEX prefix encodes some bytes of the opcode and as a result, overall size of the instruction reduces. // Therefore, to estimate the size adding EVEX prefix size and size of instruction opcode bytes will always @@ -2574,10 +2704,12 @@ bool emitter::EncodedBySSE38orSSE3A(instruction ins) const * part of an opcode. */ -inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2586,7 +2718,14 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexBPrefix(ins, *code); // REX.B + if (IsHighSIMDReg(reg)) + { + *code = AddRexXPrefix(id, *code); // EVEX.X + } + if (reg & 0x8) + { + *code = AddRexBPrefix(id, *code); // REX.B + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2608,10 +2747,12 @@ inline unsigned emitter::insEncodeReg012(instruction ins, regNumber reg, emitAtt * part of an opcode. */ -inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code) +inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code) { assert(reg < REG_STK); + instruction ins = id->idIns(); + #ifdef TARGET_AMD64 // Either code is not NULL or reg is not an extended reg. // If reg is an extended reg, instruction needs to be prefixed with 'REX' @@ -2620,7 +2761,14 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt if (IsExtendedReg(reg)) { - *code = AddRexRPrefix(ins, *code); // REX.R + if (IsHighSIMDReg(reg)) + { + *code = AddEvexRPrimePrefix(*code); // EVEX.R' + } + if (reg & 0x8) + { + *code = AddRexRPrefix(id, *code); // REX.R + } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) { @@ -2641,8 +2789,10 @@ inline unsigned emitter::insEncodeReg345(instruction ins, regNumber reg, emitAtt * Returns modified SIMD opcode with the specified register encoded in bits 3-6 of * byte 2 of VEX and EVEX prefix. */ -inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code) { + instruction ins = id->idIns(); + assert(reg < REG_STK); assert(IsVexOrEvexEncodedInstruction(ins)); assert(hasVexOrEvexPrefix(code)); @@ -2660,10 +2810,21 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg, assert(regBits <= 0xF); if (UseEvexEncoding() && IsEvexEncodedInstruction(ins)) { - if (TakesEvexPrefix(ins) && codeEvexMigrationCheck(code)) + if (TakesEvexPrefix(id) && codeEvexMigrationCheck(code)) { - assert(hasEvexPrefix(code) && TakesEvexPrefix(ins)); + assert(hasEvexPrefix(code) && TakesEvexPrefix(id)); + +// TODO-XARCH-AVX512 I don't like that we redefine regBits on the EVEX case. +// Rather see these paths cleaned up. 
+#if defined(TARGET_AMD64)
+            regBits = HighAwareRegEncoding(reg);
+            if (IsHighSIMDReg(reg))
+            {
+                // Have to set the EVEX V' bit
+                code = AddEvexVPrimePrefix(code);
+            }
+#endif
             // Shift count = 5-bytes of opcode + 0-2 bits for EVEX
             regBits <<= 43;
             return code ^ regBits;
@@ -2671,6 +2832,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
     }
     if (UseVEXEncoding() && IsVexEncodedInstruction(ins))
     {
+
+        // Both prefixes encode the register operand in 1's complement form
+        assert(regBits <= 0xF);
+
         if (TakesVexPrefix(ins))
         {
             assert(hasVexPrefix(code));
@@ -2690,8 +2855,10 @@ inline emitter::code_t emitter::insEncodeReg3456(instruction ins, regNumber reg,
  * Used exclusively to generate the REX.X bit and truncate the register.
  */

-inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t* code)
+inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code)
 {
+    instruction ins = id->idIns();
+
     assert(reg < REG_STK);

 #ifdef TARGET_AMD64
@@ -2702,7 +2869,14 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t*

     if (IsExtendedReg(reg))
     {
-        *code = AddRexXPrefix(ins, *code); // REX.X
+        if (IsHighSIMDReg(reg))
+        {
+            *code = AddEvexVPrimePrefix(*code); // EVEX.V'
+        }
+        if (reg & 0x8)
+        {
+            *code = AddRexXPrefix(id, *code); // REX.X
+        }
     }
     unsigned regBits = RegEncoding(reg);
 #else // !TARGET_AMD64
@@ -2718,7 +2892,7 @@ inline unsigned emitter::insEncodeRegSIB(instruction ins, regNumber reg, code_t*
  * Returns the "[r/m]" opcode with the mod/RM field set to register.
  */

-inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code)
+inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, code_t code)
 {
     // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes.
     // Otherwise, it will be placed after the 4 byte encoding.
@@ -2736,7 +2910,7 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, code_t code)
  * Returns the given "[r/m]" opcode with the mod/RM field set to register.
  */

-inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code)
+inline emitter::code_t emitter::insEncodeRMreg(const instrDesc* id, code_t code)
 {
     // If Byte 4 (which is 0xFF00) is 0, that's where the RM encoding goes.
     // Otherwise, it will be placed after the 4 byte encoding.
@@ -2754,11 +2928,11 @@ inline emitter::code_t emitter::insEncodeRMreg(instruction ins, code_t code)
  * the given register.
  */

-inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code)
+inline emitter::code_t emitter::insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code)
 {
     assert((code & 0xC000) == 0);
     code |= 0xC000;
-    unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8;
+    unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8;
     code |= regcode;
     return code;
 }
@@ -2769,11 +2943,11 @@ inline emitter::code_t emitter::insEncodeMRreg(instruction ins, regNumber reg, e
 * the given register.
*/ -inline emitter::code_t emitter::insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code) +inline emitter::code_t emitter::insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code) { assert((code & 0xC000) == 0); code |= 0xC000; - unsigned regcode = insEncodeReg012(ins, reg, size, &code) << 8; + unsigned regcode = insEncodeReg012(id, reg, size, &code) << 8; code |= regcode; return code; } @@ -2794,13 +2968,13 @@ inline bool insNeedsRRIb(instruction ins) * Returns the "reg,reg,imm8" opcode with both the reg's set to the * the given register. */ -inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size) { assert(size == EA_4BYTE); // All we handle for now. - assert(insNeedsRRIb(ins)); + assert(insNeedsRRIb(id->idIns())); // If this list gets longer, use a switch, or a table lookup. code_t code = 0x69c0; - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // We use the same register as source and destination. (Could have another version that does both regs...) code |= regcode; code |= (regcode << 3); @@ -2813,10 +2987,10 @@ inline emitter::code_t emitter::insEncodeRRIb(instruction ins, regNumber reg, em * nibble of the opcode */ -inline emitter::code_t emitter::insEncodeOpreg(instruction ins, regNumber reg, emitAttr size) +inline emitter::code_t emitter::insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size) { - code_t code = insCodeRR(ins); - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + code_t code = insCodeRR(id->idIns()); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; return code; } @@ -3090,7 +3264,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeRR(instrDesc* id) } else { - sz += emitInsSize(id, insEncodeRMreg(ins, code), includeRexPrefixSize); + sz += emitInsSize(id, insEncodeRMreg(id, code), includeRexPrefixSize); } return sz; @@ -3219,7 +3393,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, assert(emitComp->lvaTempsHaveLargerOffsetThanVars()); // Check whether we can use compressed displacement if EVEX. 
- if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { bool compressedFitsInByte = false; TryEvexCompressDisp8Byte(id, ssize_t(offs), &compressedFitsInByte); @@ -3263,7 +3437,7 @@ inline UNATIVE_OFFSET emitter::emitInsSizeSVCalcDisp(instrDesc* id, code_t code, #endif // !FEATURE_FIXED_OUT_ARGS bool useSmallEncoding = false; - if (TakesEvexPrefix(id->idIns())) + if (TakesEvexPrefix(id)) { TryEvexCompressDisp8Byte(id, ssize_t(offs), &useSmallEncoding); } @@ -3416,7 +3590,7 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -5026,7 +5200,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) /* We expect this to always be a 'big' opcode */ - assert(insEncodeMRreg(ins, reg, attr, insCodeMR(ins)) & 0x00FF0000); + assert(insEncodeMRreg(id, reg, attr, insCodeMR(ins)) & 0x00FF0000); size = attr; @@ -5046,7 +5220,7 @@ void emitter::emitIns_R(instruction ins, emitAttr attr, regNumber reg) id->idReg1(reg); // Vex bytes - sz += emitGetAdjustedSize(id, insEncodeMRreg(ins, reg, attr, insCodeMR(ins))); + sz += emitGetAdjustedSize(id, insEncodeMRreg(id, reg, attr, insCodeMR(ins))); // REX byte if (IsExtendedReg(reg, attr) || TakesRexWPrefix(ins, attr)) @@ -8903,7 +9077,7 @@ void emitter::emitIns_Call(EmitCallType callType, { // Tailcall with addressing mode/register needs to be rex.w // prefixed to be recognized as part of epilog by unwinder. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } sz = emitInsSizeAM(id, code); @@ -11262,13 +11436,13 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { // tail call with addressing mode (or through register) needs rex.w // prefix to be recognized by unwinder as part of epilog. - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case: call via a register if (id->idIsCallRegPtr()) { - code = insEncodeMRreg(ins, reg, EA_PTRSIZE, code); + code = insEncodeMRreg(id, reg, EA_PTRSIZE, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); dst += emitOutputWord(dst, code); goto DONE; @@ -11282,14 +11456,14 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute the REX prefix if it exists if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. 
rgx = (regNumber)RegEncoding(rgx); } @@ -11335,7 +11509,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_ARD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -11344,7 +11518,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_AWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8))); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8))); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -11361,10 +11535,10 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Emit SIMD prefix if required // There are some callers who already add SIMD prefix and call this routine. // Therefore, add SIMD prefix is one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // For this format, moves do not support a third operand, so we only need to handle the binary ops. - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -11389,11 +11563,11 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } @@ -11401,21 +11575,21 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). Not doing so currently // since we cannot differentiate EVEX vs VEX without 'code' until all paths have EVEX support. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } if (IsExtendedReg(reg, EA_PTRSIZE)) { - insEncodeReg012(ins, reg, EA_PTRSIZE, &code); + insEncodeReg012(id, reg, EA_PTRSIZE, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. reg = (regNumber)RegEncoding(reg); } if (IsExtendedReg(rgx, EA_PTRSIZE)) { - insEncodeRegSIB(ins, rgx, &code); + insEncodeRegSIB(id, rgx, &code); // TODO-Cleanup: stop casting RegEncoding() back to a regNumber. 
rgx = (regNumber)RegEncoding(rgx); } @@ -11459,7 +11633,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } } } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -11589,7 +11763,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dsp = TryEvexCompressDisp8Byte(id, dsp, &dspInByte); } @@ -11820,7 +11994,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr); + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr); // Is there a displacement? if (dspIsZero) @@ -11850,7 +12024,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // Put the register in the opcode - code |= insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) << 8; + code |= insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) << 8; // Is there a displacement? if (dspIsZero) @@ -11896,8 +12070,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) if (reg != REG_NA) { // The address is "[reg + {2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -11963,8 +12137,8 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[{2/4/8} * rgx + icon]" - regByte = insEncodeReg012(ins, REG_EBP, EA_PTRSIZE, nullptr) | - insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); + regByte = insEncodeReg012(id, REG_EBP, EA_PTRSIZE, nullptr) | + insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr) | insSSval(mul); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -11993,7 +12167,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) else { // The address is "[reg+rgx+dsp]" - regByte = insEncodeReg012(ins, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(ins, rgx, EA_PTRSIZE, nullptr); + regByte = insEncodeReg012(id, reg, EA_PTRSIZE, nullptr) | insEncodeReg345(id, rgx, EA_PTRSIZE, nullptr); if (EncodedBySSE38orSSE3A(ins) || (ins == INS_crc32)) { @@ -12231,16 +12405,16 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Add VEX or EVEX prefix if required. // There are some callers who already add prefix and call this routine. // Therefore, add VEX or EVEX prefix if one is not already present. - code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic inside TakesRexWPrefix(). // Not doing so currently since we cannot differentiate EVEX vs VEX without // 'code' until all paths have EVEX support. 
- if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Special case emitting AVX instructions @@ -12267,9 +12441,9 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -12400,7 +12574,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this int dspAsByte = dsp; - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12454,7 +12628,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // TODO-XARCH-AVX512 : working to wrap up all adjusted disp8 compression logic into the following // function, to which the remainder of the emitter logic should handle properly. // TODO-XARCH-AVX512 : embedded broadcast might change this - if (TakesEvexPrefix(ins)) + if (TakesEvexPrefix(id)) { dspAsByte = int(TryEvexCompressDisp8Byte(id, ssize_t(dsp), &dspInByte)); } @@ -12680,12 +12854,12 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) // Compute VEX/EVEX prefix // Some of its callers already add EVEX/VEX prefix and then call this routine. // Therefore add EVEX/VEX prefix is not already present. 
- code = AddSimdPrefixIfNeededAndNotPresent(ins, code, size); + code = AddSimdPrefixIfNeededAndNotPresent(id, code, size); // Compute the REX prefix - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // `addc` is used for two kinds if instructions @@ -12720,7 +12894,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { case IF_RWR_MRD: - assert(code == (insCodeRM(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeRM(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA0; @@ -12729,7 +12903,7 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) case IF_MWR_RRD: - assert(code == (insCodeMR(ins) | (insEncodeReg345(ins, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); + assert(code == (insCodeMR(ins) | (insEncodeReg345(id, REG_EAX, EA_PTRSIZE, NULL) << 8) | 0x0500)); code &= ~((code_t)0xFFFFFFFF); code |= 0xA2; @@ -12767,9 +12941,9 @@ BYTE* emitter::emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) } else { - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - unsigned regcode = insEncodeReg345(ins, reg345, size, &code); + unsigned regcode = insEncodeReg345(id, reg345, size, &code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13146,13 +13320,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code |= 0x1; } - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13166,7 +13340,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) // Output a size prefix for a 16-bit operand dst += emitOutputByte(dst, 0x66); } - dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(ins, reg, size, nullptr)); + dst += emitOutputByte(dst, insCodeRR(ins) | insEncodeReg012(id, reg, size, nullptr)); } break; @@ -13176,9 +13350,9 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) case INS_push_hide: assert(size == EA_PTRSIZE); - code = insEncodeOpreg(ins, reg, size); + code = insEncodeOpreg(id, reg, size); - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); assert(!TakesRexWPrefix(ins, size)); // Output the REX prefix @@ -13198,13 +13372,13 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) code = insCodeRR(ins); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Register... 
- unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13233,7 +13407,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); assert(size == EA_1BYTE); - code = insEncodeMRreg(ins, reg, EA_1BYTE, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, EA_1BYTE, insCodeMR(ins)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13259,7 +13433,7 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) assert(id->idGCref() == GCT_NONE); - code = insEncodeMRreg(ins, reg, size, insCodeMR(ins)); + code = insEncodeMRreg(id, reg, size, insCodeMR(ins)); if (size != EA_1BYTE) { @@ -13273,11 +13447,11 @@ BYTE* emitter::emitOutputR(BYTE* dst, instrDesc* id) } } - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output the REX prefix @@ -13364,36 +13538,36 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code = insCodeMR(ins); } - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if ((ins == INS_movsx) || (ins == INS_movzx) || (insIsCMOV(ins))) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code) | (int)(size == EA_2BYTE); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code) | (int)(size == EA_2BYTE); #ifdef TARGET_AMD64 assert((size < EA_4BYTE) || (insIsCMOV(ins))); if ((size == EA_8BYTE) || (ins == INS_movsx)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } else if (ins == INS_movsxd) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); #endif // TARGET_AMD64 } @@ -13403,8 +13577,8 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { assert(hasCodeRM(ins) && !hasCodeMI(ins) && !hasCodeMR(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); if ((ins == INS_crc32) && (size > EA_1BYTE)) { code |= 0x0100; @@ -13417,15 +13591,15 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } else if (size == EA_8BYTE) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } } #endif // FEATURE_HW_INTRINSICS else { - assert(!TakesSimdPrefix(ins)); + assert(!TakesSimdPrefix(id)); code = insCodeMR(ins); - 
code = insEncodeMRreg(ins, code); + code = insEncodeMRreg(id, code); if (ins != INS_test) { @@ -13455,7 +13629,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) // Don't need to zero out the high bits explicitly if ((ins != INS_xor) || (reg1 != reg2)) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } else { @@ -13492,10 +13666,10 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) } } - unsigned regCode = insEncodeReg345(ins, regFor345Bits, size, &code); - regCode |= insEncodeReg012(ins, regFor012Bits, size, &code); + unsigned regCode = insEncodeReg345(id, regFor345Bits, size, &code); + regCode |= insEncodeReg012(id, regFor012Bits, size, &code); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // In case of AVX instructions that take 3 operands, we generally want to encode reg1 // as first source. In this case, reg1 is both a source and a destination. @@ -13507,12 +13681,12 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) if (IsDstDstSrcAVXInstruction(ins)) { // encode source/dest operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg1, size, code); + code = insEncodeReg3456(id, reg1, size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, reg2, size, code); + code = insEncodeReg3456(id, reg2, size, code); } } @@ -13754,21 +13928,21 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) emitAttr size = id->idOpSize(); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); - code = insEncodeRMreg(ins, code); + code = insEncodeRMreg(id, code); // TODO-XARCH-AVX512 : Update this check once all paths have EVEX support. // Explore moving IsWEvexOpcodeExtension() logic to instruction table as flag. - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - unsigned regCode = insEncodeReg345(ins, targetReg, size, &code); - regCode |= insEncodeReg012(ins, src2, size, &code); + unsigned regCode = insEncodeReg345(id, targetReg, size, &code); + regCode |= insEncodeReg012(id, src2, size, &code); // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, src1, size, code); + code = insEncodeReg3456(id, src1, size, code); // Output the REX/VEX/EVEX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13862,17 +14036,17 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // Get the 'base' opcode. code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); assert(code & 0x00FF0000); - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { // The 'vvvv' bits encode the destination register, which for this case (RI) // is the same as the source. 
- code = insEncodeReg3456(ins, reg, size, code); + code = insEncodeReg3456(id, reg, size, code); } - unsigned regcode = (insEncodeReg345(ins, regOpcode, size, &code) | insEncodeReg012(ins, reg, size, &code)) << 8; + unsigned regcode = (insEncodeReg345(id, regOpcode, size, &code) | insEncodeReg012(id, reg, size, &code)) << 8; // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -13900,15 +14074,15 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) assert(code < 0x100); code |= 0x08; // Set the 'w' bit - unsigned regcode = insEncodeReg012(ins, reg, size, &code); + unsigned regcode = insEncodeReg012(id, reg, size, &code); code |= regcode; // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -14000,13 +14174,13 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) // r/m, immed form, but do have a dstReg,srcReg,imm8 form. if (valInByte && useSigned && insNeedsRRIb(ins)) { - code = insEncodeRRIb(ins, reg, size); + code = insEncodeRRIb(id, reg, size); } else { code = insCodeMI(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMIreg(ins, reg, size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMIreg(id, reg, size, code); } } @@ -14030,7 +14204,7 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) /* Set the 'w' bit to get the large version */ /* and the REX.W bit to get the really large version */ - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); code |= 0x1; break; #endif @@ -14239,9 +14413,9 @@ BYTE* emitter::emitOutputIV(BYTE* dst, instrDesc* id) } else { - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); } @@ -14549,7 +14723,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) idAmd->idCodeSize(sz); code = insCodeRM(ins); - code |= (insEncodeReg345(ins, id->idReg1(), EA_PTRSIZE, &code) << 8); + code |= (insEncodeReg345(id, id->idReg1(), EA_PTRSIZE, &code) << 8); dst = emitOutputAM(dst, idAmd, code, nullptr); @@ -14665,7 +14839,7 @@ ssize_t emitter::GetInputSizeInBytes(instrDesc* id) // ssize_t emitter::TryEvexCompressDisp8Byte(instrDesc* id, ssize_t dsp, bool* dspInByte) { - assert(TakesEvexPrefix(id->idIns())); + assert(TakesEvexPrefix(id)); insTupleType tt = insTupleTypeInfo(id->idIns()); assert(hasTupleTypeInfo(id->idIns())); @@ -14876,12 +15050,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) #ifdef TARGET_AMD64 // Support only scalar AVX instructions and hence size is hard coded to 4-byte. 
- code = AddSimdPrefixIfNeeded(ins, code, EA_4BYTE); + code = AddSimdPrefixIfNeeded(id, code, EA_4BYTE); if (((ins == INS_cdq) || (ins == INS_cwde)) && - (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins)))) + (TakesRexWPrefix(ins, id->idOpSize()) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id)))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); #endif @@ -15155,8 +15329,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RRW_SHF: code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, id->idReg1(), size, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, id->idReg1(), size, code); // set the W bit if (size != EA_1BYTE) @@ -15165,9 +15339,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } // Emit the REX prefix if it exists - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } // Output a size prefix for a 16-bit operand @@ -15223,8 +15397,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeMR(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeMRreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeMRreg(id, code); mReg = id->idReg1(); rReg = id->idReg2(); } @@ -15233,7 +15407,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) code = insCodeMI(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); assert((code & 0xC000) == 0); code |= 0xC000; @@ -15247,19 +15421,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { code = insCodeRM(ins); // Emit the VEX prefix if it exists - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeRMreg(ins, code); + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeRMreg(id, code); mReg = id->idReg2(); rReg = id->idReg1(); } assert(code & 0x00FF0000); - if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(ins))) + if (TakesRexWPrefix(ins, size) || (codeEvexMigrationCheck(code) && IsWEvexOpcodeExtension(id))) { - code = AddRexWPrefix(ins, code); + code = AddRexWPrefix(id, code); } - if (TakesSimdPrefix(ins)) + if (TakesSimdPrefix(id)) { if (IsDstDstSrcAVXInstruction(ins)) { @@ -15269,17 +15443,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // (Though we will need to handle the few ops that can have the 'vvvv' bits as destination, // e.g. pslldq, when/if we support those instructions with 2 registers.) // (see x64 manual Table 2-9. Instructions with a VEX.vvvv destination) - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } else if (IsDstSrcSrcAVXInstruction(ins)) { // This is a "merge" move instruction. 
// Encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg2(), size, code); + code = insEncodeReg3456(id, id->idReg2(), size, code); } } - regcode = (insEncodeReg345(ins, rReg, size, &code) | insEncodeReg012(ins, mReg, size, &code)); + regcode = (insEncodeReg345(id, rReg, size, &code) | insEncodeReg012(id, mReg, size, &code)); // Output the REX prefix dst += emitOutputRexOrSimdPrefixIfNeeded(ins, dst, code); @@ -15394,8 +15568,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } @@ -15422,8 +15596,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); } sz = emitSizeOfInsDsc(id); @@ -15451,8 +15625,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode, &cnsVal); } sz = emitSizeOfInsDsc(id); @@ -15463,8 +15637,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD: case IF_ARW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + code = AddSimdPrefixIfNeeded(id, code, size); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputAM(dst, id, code | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15472,7 +15646,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_AWR_RRD_RRD: { code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); dst = emitOutputAM(dst, id, code); sz = emitSizeOfInsDsc(id); break; @@ -15561,7 +15735,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
@@ -15572,10 +15746,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15596,15 +15770,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } @@ -15617,8 +15791,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15629,7 +15803,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); } break; @@ -15643,8 +15817,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // 4-byte AVX instructions are special cased inside emitOutputSV @@ -15655,7 +15829,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode, &cnsVal); } @@ -15667,7 +15841,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_SWR_RRD: case IF_SRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. 
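On the IsDstDstSrcAVXInstruction checks threaded through these formats: the legacy SSE encoding is two-operand read-modify-write, so the destination implicitly doubles as the first source, while the VEX encoding names the first source separately in vvvv. A toy C++ model of the semantic difference (assumed shapes for illustration, not JIT code):

    struct Xmm
    {
        float lane[4];
    };

    // Legacy SSE shape: dst = dst op src (destination doubles as first source).
    static void addps(Xmm& dst, const Xmm& src)
    {
        for (int i = 0; i < 4; i++)
            dst.lane[i] += src.lane[i];
    }

    // VEX three-operand shape: dst, src1, src2. src1 travels in vvvv, so the
    // destination no longer has to alias an input.
    static void vaddps(Xmm& dst, const Xmm& src1, const Xmm& src2)
    {
        for (int i = 0; i < 4; i++)
            dst.lane[i] = src1.lane[i] + src2.lane[i];
    }

This is why the cases above only feed reg1 into insEncodeReg3456 when the instruction has the dst-dst-src form: for that shape reg1 is both the destination and the first source.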
@@ -15678,10 +15852,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputSV(dst, id, code | regcode); break; @@ -15715,7 +15889,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15726,10 +15900,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } @@ -15760,15 +15934,15 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } @@ -15782,8 +15956,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) assert(IsVexOrEvexEncodedInstruction(ins)); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15793,7 +15967,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); } sz = emitSizeOfInsDsc(id); @@ -15808,8 +15982,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) emitGetInsCns(id, &cnsVal); code = insCodeRM(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); - code = insEncodeReg3456(ins, id->idReg2(), size, + code = AddSimdPrefixIfNeeded(id, code, size); + code = insEncodeReg3456(id, id->idReg2(), size, code); // encode source operand reg in 'vvvv' bits in 1's complement form // Special case 4-byte AVX instructions @@ -15819,7 +15993,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) } else { - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500, &cnsVal); } sz = 
emitSizeOfInsDsc(id); @@ -15828,7 +16002,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_RWR_MRD_OFF: code = insCode(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15839,10 +16013,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = insEncodeReg012(id->idIns(), id->idReg1(), size, &code); + regcode = insEncodeReg012(id, id->idReg1(), size, &code); dst = emitOutputCV(dst, id, code | 0x30 | regcode); sz = emitSizeOfInsDsc(id); break; @@ -15851,7 +16025,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) case IF_MWR_RRD: case IF_MRW_RRD: code = insCodeMR(ins); - code = AddSimdPrefixIfNeeded(ins, code, size); + code = AddSimdPrefixIfNeeded(id, code, size); // In case of AVX instructions that take 3 operands, encode reg1 as first source. // Note that reg1 is both a source and a destination. @@ -15862,10 +16036,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) if (IsDstDstSrcAVXInstruction(ins)) { // encode source operand reg in 'vvvv' bits in 1's complement form - code = insEncodeReg3456(ins, id->idReg1(), size, code); + code = insEncodeReg3456(id, id->idReg1(), size, code); } - regcode = (insEncodeReg345(ins, id->idReg1(), size, &code) << 8); + regcode = (insEncodeReg345(id, id->idReg1(), size, &code) << 8); dst = emitOutputCV(dst, id, code | regcode | 0x0500); sz = emitSizeOfInsDsc(id); break; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index dd4eec46dadb92..6741676dfce43f 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -75,16 +75,16 @@ unsigned emitGetAdjustedSize(instrDesc* id, code_t code) const; code_t emitExtractVexPrefix(instruction ins, code_t& code) const; code_t emitExtractEvexPrefix(instruction ins, code_t& code) const; -unsigned insEncodeReg012(instruction ins, regNumber reg, emitAttr size, code_t* code); -unsigned insEncodeReg345(instruction ins, regNumber reg, emitAttr size, code_t* code); -code_t insEncodeReg3456(instruction ins, regNumber reg, emitAttr size, code_t code); -unsigned insEncodeRegSIB(instruction ins, regNumber reg, code_t* code); +unsigned insEncodeReg012(const instrDesc* id, regNumber reg, emitAttr size, code_t* code); +unsigned insEncodeReg345(const instrDesc* id, regNumber reg, emitAttr size, code_t* code); +code_t insEncodeReg3456(const instrDesc* id, regNumber reg, emitAttr size, code_t code); +unsigned insEncodeRegSIB(const instrDesc* id, regNumber reg, code_t* code); -code_t insEncodeMRreg(instruction ins, code_t code); -code_t insEncodeRMreg(instruction ins, code_t code); -code_t insEncodeMRreg(instruction ins, regNumber reg, emitAttr size, code_t code); -code_t insEncodeRRIb(instruction ins, regNumber reg, emitAttr size); -code_t insEncodeOpreg(instruction ins, regNumber reg, emitAttr size); +code_t insEncodeMRreg(const instrDesc* id, code_t code); +code_t insEncodeRMreg(const instrDesc* id, code_t code); +code_t insEncodeMRreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); +code_t insEncodeRRIb(const instrDesc* id, regNumber reg, emitAttr size); 
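The emitxarch.h signature changes here move the encoder helpers from a bare `instruction` to `const instrDesc*`. A sketch of why, using stand-in types (nothing below is the real implementation): with AVX-512, whether VEX suffices or an EVEX prefix is required depends on which registers a particular instruction actually uses, since xmm16-xmm31 are only reachable through EVEX's extra register-number bits, and only the instruction descriptor carries that operand state (compare the HasHighSIMDReg/IsHighSIMDReg declarations added later in this header).

    // Stand-in for the JIT's instrDesc; illustrative only.
    struct ToyInstrDesc
    {
        unsigned reg1;
        unsigned reg2;
    };

    // xmm16-xmm31 need the EVEX R'/X'/V' bits to be encodable at all.
    static bool isHighSimdReg(unsigned reg)
    {
        return (reg >= 16) && (reg < 32);
    }

    static bool operandsForceEvex(const ToyInstrDesc* id)
    {
        return isHighSimdReg(id->reg1) || isHighSimdReg(id->reg2);
    }

The opcode alone cannot answer that question, which is why the same renaming (ins to id) repeats mechanically through every format case in emitOutputInstr above.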
+code_t insEncodeOpreg(const instrDesc* id, regNumber reg, emitAttr size); unsigned insSSval(unsigned scale); @@ -103,16 +103,19 @@ bool IsVexEncodedInstruction(instruction ins) const; bool IsEvexEncodedInstruction(instruction ins) const; bool IsVexOrEvexEncodedInstruction(instruction ins) const; -code_t insEncodeMIreg(instruction ins, regNumber reg, emitAttr size, code_t code); +code_t insEncodeMIreg(const instrDesc* id, regNumber reg, emitAttr size, code_t code); -code_t AddRexWPrefix(instruction ins, code_t code); -code_t AddRexRPrefix(instruction ins, code_t code); -code_t AddRexXPrefix(instruction ins, code_t code); -code_t AddRexBPrefix(instruction ins, code_t code); +code_t AddRexWPrefix(const instrDesc* id, code_t code); +code_t AddRexRPrefix(const instrDesc* id, code_t code); +code_t AddRexXPrefix(const instrDesc* id, code_t code); +code_t AddRexBPrefix(const instrDesc* id, code_t code); code_t AddRexPrefix(instruction ins, code_t code); bool EncodedBySSE38orSSE3A(instruction ins) const; bool Is4ByteSSEInstruction(instruction ins) const; +code_t AddEvexVPrimePrefix(code_t code); +code_t AddEvexRPrimePrefix(code_t code); + static bool IsMovInstruction(instruction ins); bool HasSideEffect(instruction ins, emitAttr size); bool IsRedundantMov( @@ -181,13 +184,15 @@ code_t AddVexPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr // Returns: // `true` if W bit needs to be set to 1. // -bool IsWEvexOpcodeExtension(instruction ins) +bool IsWEvexOpcodeExtension(const instrDesc* id) { - if (!TakesEvexPrefix(ins)) + if (!TakesEvexPrefix(id)) { return false; } + instruction ins = id->idIns(); + switch (ins) { case INS_movq: @@ -486,7 +491,7 @@ bool UseSimdEncoding() const #define EVEX_PREFIX_MASK 0xFF00000000000000ULL #define EVEX_PREFIX_CODE 0x6200000000000000ULL -bool TakesEvexPrefix(instruction ins) const; +bool TakesEvexPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasEvexPrefix: Returns true if the instruction encoding already @@ -514,9 +519,13 @@ code_t AddEvexPrefix(instruction ins, code_t code, emitAttr attr); // // Returns: // code with prefix added. -code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) +// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need +// to pass emitAttr size) +code_t AddSimdPrefixIfNeeded(const instrDesc* id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = AddEvexPrefix(ins, code, size); } @@ -537,11 +546,14 @@ code_t AddSimdPrefixIfNeeded(instruction ins, code_t code, emitAttr size) // size - operand size // // Returns: -// `true` if code has an Evex prefix. -// -code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr size) +// code with the SIMD prefix added if one is needed and not already present. +// TODO-XARCH-AVX512 come back and check whether we can use `id` directly (no need +// to pass emitAttr size) +code_t AddSimdPrefixIfNeededAndNotPresent(const instrDesc* id, code_t code, emitAttr size) { - if (TakesEvexPrefix(ins)) + instruction ins = id->idIns(); + + if (TakesEvexPrefix(id)) { code = !hasEvexPrefix(code) ? 
AddEvexPrefix(ins, code, size) : code; } @@ -552,7 +564,7 @@ code_t AddSimdPrefixIfNeededAndNotPresent(instruction ins, code_t code, emitAttr return code; } -bool TakesSimdPrefix(instruction ins) const; +bool TakesSimdPrefix(const instrDesc* id) const; //------------------------------------------------------------------------ // hasVexOrEvexPrefix: Returns true if the instruction encoding already @@ -1024,4 +1036,7 @@ inline bool HasEmbeddedBroadcast(instrDesc* id) return false; } +inline bool HasHighSIMDReg(const instrDesc* id) const; +inline bool IsHighSIMDReg(regNumber) const; + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index dd1dc0e70d371e..55e40fa6fad5a7 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -19034,6 +19034,27 @@ bool GenTree::isRMWHWIntrinsic(Compiler* comp) #endif } +//------------------------------------------------------------------------ +// isEvexCompatibleHWIntrinsic: Checks if the intrinsic has a compatible +// EVEX form for its intended lowering instruction. +// +// Return Value: +// true if the intrinsic node's lowering instruction has an EVEX form +// +bool GenTree::isEvexCompatibleHWIntrinsic() +{ + assert(gtOper == GT_HWINTRINSIC); + +// TODO-XARCH-AVX512 remove the ReturnsPerElementMask check once K registers have been properly +// implemented in the register allocator +#if defined(TARGET_AMD64) + return HWIntrinsicInfo::HasEvexSemantics(AsHWIntrinsic()->GetHWIntrinsicId()) && + !HWIntrinsicInfo::ReturnsPerElementMask(AsHWIntrinsic()->GetHWIntrinsicId()); +#else + return false; +#endif +} + GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, NamedIntrinsic hwIntrinsicID, CorInfoType simdBaseJitType, diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h index 790d14f2841779..3dea877572f76d 100644 --- a/src/coreclr/jit/gentree.h +++ b/src/coreclr/jit/gentree.h @@ -1508,6 +1508,7 @@ struct GenTree bool isCommutativeHWIntrinsic() const; bool isContainableHWIntrinsic() const; bool isRMWHWIntrinsic(Compiler* comp); + bool isEvexCompatibleHWIntrinsic(); #else bool isCommutativeHWIntrinsic() const { @@ -1523,6 +1524,11 @@ struct GenTree { return false; } + + bool isEvexCompatibleHWIntrinsic() + { + return false; + } #endif // FEATURE_HW_INTRINSICS static bool OperIsCommutative(genTreeOps gtOper) diff --git a/src/coreclr/jit/hwintrinsic.h b/src/coreclr/jit/hwintrinsic.h index b1299df1c1f1cf..bacb22173cedfa 100644 --- a/src/coreclr/jit/hwintrinsic.h +++ b/src/coreclr/jit/hwintrinsic.h @@ -158,6 +158,9 @@ enum HWIntrinsicFlag : unsigned int // contained HW_Flag_MaybeCommutative = 0x80000, + // The intrinsic has no EVEX compatible form + HW_Flag_NoEvexSemantics = 0x100000 + #elif defined(TARGET_ARM64) // The intrinsic has an immediate operand // - the value can be (and should be) encoded in a corresponding instruction when the operand value is constant @@ -172,8 +175,7 @@ enum HWIntrinsicFlag : unsigned int HW_Flag_SIMDScalar = 0x1000, // The intrinsic supports some sort of containment analysis - HW_Flag_SupportsContainment = 0x2000 - + HW_Flag_SupportsContainment = 0x2000, #else #error Unsupported platform #endif @@ -758,6 +760,22 @@ struct HWIntrinsicInfo return (flags & HW_Flag_HasRMWSemantics) != 0; #else #error Unsupported platform +#endif + } + //------------------------------------------------------------------------ + // HasEvexSemantics: Checks if the NamedIntrinsic has a lowering + // to an instruction with an EVEX form. 
+ // + // Return Value: + // true if the NamedIntrinsic lowering has an EVEX form. + // + static bool HasEvexSemantics(NamedIntrinsic id) + { +#if defined(TARGET_XARCH) + HWIntrinsicFlag flags = lookupFlags(id); + return (flags & HW_Flag_NoEvexSemantics) == 0; +#else + return false; #endif } diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index 8d5c2d16a35cbd..f474d5387333f4 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -68,7 +68,7 @@ HARDWARE_INTRINSIC(Vector128, EqualsAll, HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector128, get_One, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) @@ -171,7 +171,7 @@ HARDWARE_INTRINSIC(Vector256, EqualsAll, HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible) -HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) +HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(Vector256, get_One, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_Zero, 32, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_ReturnsPerElementMask|HW_Flag_AvxOnlyCompatible) HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible) @@ -256,42 +256,42 @@ HARDWARE_INTRINSIC(SSE, Add, HARDWARE_INTRINSIC(SSE, AddScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, And, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, AndNot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, 
HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, 
HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, ConvertScalarToVector128Single, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss32, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromSecondArg|HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -308,7 +308,7 @@ HARDWARE_INTRINSIC(SSE, Min, HARDWARE_INTRINSIC(SSE, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE, MoveHighToLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, MoveLowToHigh, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) -HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg) +HARDWARE_INTRINSIC(SSE, MoveMask, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, MoveScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE, MultiplyScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -317,10 +317,10 @@ HARDWARE_INTRINSIC(SSE, Prefetch0, HARDWARE_INTRINSIC(SSE, Prefetch1, 0, 1, {INS_invalid, INS_prefetcht1, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, Prefetch2, 0, 1, {INS_invalid, INS_prefetcht2, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, PrefetchNonTemporal, 0, 1, {INS_invalid, INS_prefetchnta, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Special, 
HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE, Reciprocal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE, ReciprocalSqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE, Shuffle, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) HARDWARE_INTRINSIC(SSE, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE, SqrtScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) @@ -357,42 +357,42 @@ HARDWARE_INTRINSIC(SSE2, AddScalar, HARDWARE_INTRINSIC(SSE2, And, 16, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE2, AndNot, 16, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE2, Average, 16, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE2, CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, 
CompareEqual, 16, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareGreaterThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) +HARDWARE_INTRINSIC(SSE2, CompareLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarLessThan, 16, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, 
HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, CompareNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask) HARDWARE_INTRINSIC(SSE2, CompareScalarOrderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(SSE2, CompareScalarUnorderedNotEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits) -HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask) -HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics) 
+HARDWARE_INTRINSIC(SSE2, CompareNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotGreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarNotLessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarOrdered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, CompareUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2, CompareScalarUnordered, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToInt32WithTruncation, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttsd2si}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2, ConvertToUInt32, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
@@ -420,7 +420,7 @@ HARDWARE_INTRINSIC(SSE2, MemoryFence,
 HARDWARE_INTRINSIC(SSE2, MaxScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2, Min, 16, 2, {INS_invalid, INS_pminub, INS_pminsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(SSE2, MinScalar, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE2, MoveMask, 16, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE2, MoveScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movq, INS_movq, INS_invalid, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuludq, INS_invalid, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2, MultiplyHigh, 16, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -473,11 +473,11 @@ HARDWARE_INTRINSIC(SSE2_X64, StoreNonTemporal,
 //                                                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ************************************************************************************************************************
 // SSE3 Intrinsics
-HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3, AddSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadAndDuplicateToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_lddqu, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE3, LoadDquVector128, 16, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveHighAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -489,54 +489,54 @@ HARDWARE_INTRINSIC(SSE3, MoveLowAndDuplicate,
 // SSSE3 Intrinsics
 HARDWARE_INTRINSIC(SSSE3, Abs, 16, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(SSSE3, AlignRight, 16, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAdd, 16, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalAddSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtract, 16, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSSE3, HorizontalSubtractSaturate, 16, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSSE3, MultiplyAddAdjacent, 16, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, MultiplyHighRoundScale, 16, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSSE3, Shuffle, 16, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSSE3, Sign, 16, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 // ************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
 //                                                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ************************************************************************************************************************
 // SSE41 Intrinsics
-HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(SSE41, Blend, 16, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, BlendVariable, 16, 3, {INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_pblendvb, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, Ceiling, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CeilingScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, CompareEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int16, 16, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int32, 16, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(SSE41, ConvertToVector128Int64, 16, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, DotProduct, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_dppd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Extract, 16, 2, {INS_pextrb, INS_pextrb, INS_invalid, INS_invalid, INS_pextrd, INS_pextrd, INS_invalid, INS_invalid, INS_extractps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MultiIns|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE41, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, FloorScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Insert, 16, 3, {INS_pinsrb, INS_pinsrb, INS_invalid, INS_invalid, INS_pinsrd, INS_pinsrd, INS_invalid, INS_invalid, INS_insertps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE41, LoadAlignedVector128NonTemporal, 16, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41, Max, 16, 2, {INS_pmaxsb, INS_invalid, INS_invalid, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, Min, 16, 2, {INS_pminsb, INS_invalid, INS_invalid, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(SSE41, MinHorizontal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_phminposuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, MultipleSumAbsoluteDifferences, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(SSE41, Multiply, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE41, MultiplyLow, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41, PackUnsignedSaturate, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_packusdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirection, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundCurrentDirectionScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestInteger, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNearestIntegerScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToNegativeInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinity, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToPositiveInfinityScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZero, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, RoundToZeroScalar, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundss, INS_roundsd}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestNotZAndNotC, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(SSE41, TestZ, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 // ************************************************************************************************************************
 // ISA Function name SIMD size NumArg Instructions Category Flags
@@ -568,15 +568,15 @@ HARDWARE_INTRINSIC(SSE42_X64, Crc32,
 // ************************************************************************************************************************
 // AVX Intrinsics
 HARDWARE_INTRINSIC(AVX, Add, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, AddSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addsubps, INS_addsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, And, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_andpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, AndNot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_andnpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Blend, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendps, INS_blendpd}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, BlendVariable, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, Ceiling, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector128, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, BroadcastScalarToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, BroadcastVector128ToVector256, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vbroadcastf128, INS_vbroadcastf128}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Compare, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_IMM, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareGreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
@@ -590,7 +590,7 @@ HARDWARE_INTRINSIC(AVX, CompareNotLessThan,
 HARDWARE_INTRINSIC(AVX, CompareNotLessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareOrdered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
 HARDWARE_INTRINSIC(AVX, CompareUnordered, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_cmppd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(AVX, CompareScalar, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_cmpsd}, HW_Category_IMM, HW_Flag_CopyUpperBits|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Single, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtpd2ps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtps2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
@@ -599,43 +599,43 @@ HARDWARE_INTRINSIC(AVX, ConvertToVector256Double,
 HARDWARE_INTRINSIC(AVX, ConvertToVector128Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttpd2dq, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ConvertToVector256Int32WithTruncation, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttps2dq, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_divpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, DotProduct, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_dpps, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateEvenIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movsldup, INS_movddup}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, DuplicateOddIndexed, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movshdup, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, ExtractVector128, 32, 2, {INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128, INS_vextractf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_haddps, INS_haddpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_hsubps, INS_hsubpd}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, InsertVector128, 32, 3, {INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128, INS_vinsertf128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, LoadAlignedVector256, 32, 1, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, LoadDquVector256, 32, 1, {INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_lddqu, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, LoadVector256, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, Max, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_maxpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
 HARDWARE_INTRINSIC(AVX, Min, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_minpd}, HW_Category_SimpleSIMD, HW_Flag_MaybeCommutative)
-HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
-HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vmaskmovps, INS_vmaskmovpd}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, MoveMask, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_movmskpd}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Or, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_orpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX, Permute, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilps, INS_vpermilpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX, Permute2x128, 32, 3, {INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128, INS_vperm2f128}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, PermuteVar, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermilpsvar, INS_vpermilpdvar}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX, Reciprocal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, ReciprocalSqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundCurrentDirection, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNearestInteger, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToNegativeInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToPositiveInfinity, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, RoundToZero, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_roundps, INS_roundpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, Shuffle, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_shufpd}, HW_Category_IMM, HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_sqrtpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX, Store, 32, 2, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoCodeGen)
 HARDWARE_INTRINSIC(AVX, StoreAligned, 32, 2, {INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movdqa, INS_movaps, INS_movapd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, StoreAlignedNonTemporal, 32, 2, {INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntdq, INS_movntps, INS_movntpd}, HW_Category_MemoryStore, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_subpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
-HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX, TestC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestNotZAndNotC, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX, TestZ, -1, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX, UnpackHigh, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_unpckhpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, UnpackLow, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_unpcklpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX, Xor, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_xorpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
@@ -648,47 +648,47 @@ HARDWARE_INTRINSIC(AVX, Xor,
 HARDWARE_INTRINSIC(AVX2, Abs, 32, 1, {INS_pabsb, INS_invalid, INS_pabsw, INS_invalid, INS_pabsd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX2, Add, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AddSaturate, 32, 2, {INS_paddsb, INS_paddusb, INS_paddsw, INS_paddusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, AlignRight, 32, 3, {INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_palignr, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, And, 32, 2, {INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_pand, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, AndNot, 32, 2, {INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_pandn, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, Average, 32, 2, {INS_invalid, INS_pavgb, INS_invalid, INS_pavgw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, Blend, -1, 3, {INS_invalid, INS_invalid, INS_pblendw, INS_pblendw, INS_vpblendd, INS_vpblendd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, BlendVariable, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector128, 16, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_movddup}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, BroadcastScalarToVector256, 32, 1, {INS_vpbroadcastb, INS_vpbroadcastb, INS_vpbroadcastw, INS_vpbroadcastw, INS_vpbroadcastd, INS_vpbroadcastd, INS_vpbroadcastq, INS_vpbroadcastq, INS_vbroadcastss, INS_vbroadcastsd}, HW_Category_SIMDScalar, HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
-HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask)
+HARDWARE_INTRINSIC(AVX2, BroadcastVector128ToVector256, 32, 1, {INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareEqual, 32, 2, {INS_pcmpeqb, INS_pcmpeqb, INS_pcmpeqw, INS_pcmpeqw, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqq, INS_pcmpeqq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareGreaterThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, CompareLessThan, 32, 2, {INS_pcmpgtb, INS_invalid, INS_pcmpgtw, INS_invalid, INS_pcmpgtd, INS_invalid, INS_pcmpgtq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag|HW_Flag_ReturnsPerElementMask|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, ExtractVector128, 32, 2, {INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_vextracti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, ConvertToInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToUInt32, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int16, 32, 1, {INS_pmovsxbw, INS_pmovzxbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int32, 32, 1, {INS_pmovsxbd, INS_pmovzxbd, INS_pmovsxwd, INS_pmovzxwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
 HARDWARE_INTRINSIC(AVX2, ConvertToVector256Int64, 32, 1, {INS_pmovsxbq, INS_pmovzxbq, INS_pmovsxwq, INS_pmovzxwq, INS_pmovsxdq, INS_pmovzxdq, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, GatherVector128, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherVector256, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector128, 16, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, GatherMaskVector256, 32, 5, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpgatherdd, INS_vpgatherdd, INS_vpgatherdq, INS_vpgatherdq, INS_vgatherdps, INS_vgatherdpd}, HW_Category_IMM, HW_Flag_MaybeMemoryLoad|HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAdd, 32, 2, {INS_invalid, INS_invalid, INS_phaddw, INS_phaddw, INS_phaddd, INS_phaddd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalAddSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phaddsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtract, 32, 2, {INS_invalid, INS_invalid, INS_phsubw, INS_invalid, INS_phsubd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, HorizontalSubtractSaturate, 32, 2, {INS_invalid, INS_invalid, INS_phsubsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, InsertVector128, 32, 3, {INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_vinserti128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, LoadAlignedVector256NonTemporal, 32, 1, {INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_movntdqa, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX2, MaskLoad, -1, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoEvexSemantics)
+HARDWARE_INTRINSIC(AVX2, MaskStore, -1, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpmaskmovd, INS_vpmaskmovd, INS_vpmaskmovq, INS_vpmaskmovq, INS_invalid, INS_invalid}, HW_Category_MemoryStore, HW_Flag_NoContainment|HW_Flag_BaseTypeFromSecondArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Max, 32, 2, {INS_pmaxsb, INS_pmaxub, INS_pmaxsw, INS_pmaxuw, INS_pmaxsd, INS_pmaxud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, Min, 32, 2, {INS_pminsb, INS_pminub, INS_pminsw, INS_pminuw, INS_pminsd, INS_pminud, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(AVX2, MoveMask, 32, 1, {INS_pmovmskb, INS_pmovmskb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Multiply, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, MultipleSumAbsoluteDifferences, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_mpsadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, MultiplyAddAdjacent, 32, 2, {INS_invalid, INS_invalid, INS_pmaddubsw, INS_invalid, INS_pmaddwd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, MultiplyHigh, 32, 2, {INS_invalid, INS_invalid, INS_pmulhw, INS_pmulhuw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, MultiplyHighRoundScale, 32, 2, {INS_invalid, INS_invalid, INS_pmulhrsw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, MultiplyLow, 32, 2, {INS_invalid, INS_invalid, INS_pmullw, INS_pmullw, INS_pmulld, INS_pmulld, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2, Or, 32, 2, {INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_por, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
+HARDWARE_INTRINSIC(AVX2, Permute2x128, 32, 3, {INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_vperm2i128, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, Permute4x64, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermq, INS_vpermq, INS_invalid, INS_vpermpd}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, PermuteVar8x32, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vpermd, INS_vpermd, INS_invalid, INS_invalid, INS_vpermps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport)
 HARDWARE_INTRINSIC(AVX2, PackSignedSaturate, 32, 2, {INS_packsswb, INS_invalid, INS_packssdw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -704,7 +704,7 @@ HARDWARE_INTRINSIC(AVX2, ShiftRightLogicalVariable,
 HARDWARE_INTRINSIC(AVX2, Shuffle, 32, 2, {INS_pshufb, INS_pshufb, INS_invalid, INS_invalid, INS_pshufd, INS_pshufd, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_MaybeIMM)
 HARDWARE_INTRINSIC(AVX2, ShuffleHigh, 32, 2, {INS_invalid, INS_invalid, INS_pshufhw, INS_pshufhw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2, ShuffleLow, 32, 2, {INS_invalid, INS_invalid, INS_pshuflw, INS_pshuflw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2, Sign, 32, 2, {INS_psignb, INS_invalid, INS_psignw, INS_invalid, INS_psignd, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics)
 HARDWARE_INTRINSIC(AVX2, SumAbsoluteDifferences, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_psadbw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, Subtract, 32, 2, {INS_psubb, INS_psubb, INS_psubw, INS_psubw, INS_psubd, INS_psubd, INS_psubq, INS_psubq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2, SubtractSaturate, 32, 2, {INS_psubsb, INS_psubusb, INS_psubsw, INS_psubusw, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
@@ -723,56 +723,56 @@ HARDWARE_INTRINSIC(AVXVNNI, MultiplyWideningAndAddSaturate,
 //                                                                          {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE}
 // ************************************************************************************************************************
 // AES Intrinsics
-HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid,
INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(AES, Decrypt, 16, 2, {INS_invalid, INS_aesdec, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, DecryptLast, 16, 2, {INS_invalid, INS_aesdeclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, Encrypt, 16, 2, {INS_invalid, INS_aesenc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, EncryptLast, 16, 2, {INS_invalid, INS_aesenclast, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, InverseMixColumns, 16, 1, {INS_invalid, INS_aesimc, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AES, KeygenAssist, 16, 2, {INS_invalid, INS_aeskeygenassist, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) 
HARDWARE_INTRINSIC(BMI1, GetMaskUpToLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI1_X64, AndNot, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andn, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI1_X64, ExtractLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) HARDWARE_INTRINSIC(BMI1_X64, GetMaskUpToLowestSetBit, 0, 1, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsmsk, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, ResetLowestSetBit, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blsr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(BMI1_X64, TrailingZeroCount, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_tzcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns) -HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport) +HARDWARE_INTRINSIC(BMI1_X64, BitFieldExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bextr, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_MultiIns|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitDeposit, 0, 2, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport) -HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitDeposit, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pdep, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, 
HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ParallelBitExtract, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pext, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, ZeroHighBits, 0, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_bzhi, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_SpecialImport|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(BMI2_X64, MultiplyNoFlags, 0, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulx, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoContainment|HW_Flag_MaybeMemoryStore|HW_Flag_MultiIns|HW_Flag_NoFloatingPointUsed|HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -809,7 +809,7 @@ HARDWARE_INTRINSIC(LZCNT_X64, LeadingZeroCount, // {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // PCLMULQDQ Intrinsics -HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM) +HARDWARE_INTRINSIC(PCLMULQDQ, CarrylessMultiply, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pclmulqdq, INS_pclmulqdq, INS_invalid, INS_invalid}, HW_Category_IMM, HW_Flag_FullRangeIMM|HW_Flag_NoEvexSemantics) // *************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************** // ISA Function name SIMD size NumArg Instructions Category Flags @@ -842,8 +842,8 @@ HARDWARE_INTRINSIC(SSE, COMISS, HARDWARE_INTRINSIC(SSE, UCOMISS, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, COMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comisd}, 
HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(SSE2, UCOMISD, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomisd}, HW_Category_SIMDScalar, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) -HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics) +HARDWARE_INTRINSIC(SSE41, PTEST, 16, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) +HARDWARE_INTRINSIC(AVX, PTEST, 0, 2, {INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_ptest, INS_vtestps, INS_vtestpd}, HW_Category_SimpleSIMD, HW_Flag_NoRMWSemantics|HW_Flag_NoEvexSemantics) #endif // FEATURE_HW_INTRINSIC diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index a6968c123c7381..13ed02d75c6ead 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -202,8 +202,8 @@ INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit) INST3(movntps, "movntps", IUM_WR, PCKFLT(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_NONE, INS_FLAGS_None) -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_NONE, INS_FLAGS_None) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqu32 +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit) // TODO-XARCH-AVX512 TT and IP encoded is movdqa32 INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlps, "movlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), INS_TT_TUPLE1_FIXED, Input_32Bit | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_Flags_IsDstSrcSrcAVXInstruction) @@ -341,10 +341,10 @@ INST3(pmulhuw, "pmulhuw", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmuludq, "pmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), INS_TT_FULL_MEM, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result INST3(pmullw, "pmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), INS_TT_FULL_MEM, Input_16Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result // TODO-XArch-AVX512: pand, pandn, por, and pxor have AVX512 instructions under different names, pandd, pandq etc -INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs -INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND 
NOT of two xmm regs -INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs -INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(pand, "pand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pandd +INST3(pandn, "pandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pandnd +INST3(por, "por", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pord +INST3(pxor, "pxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), INS_TT_FULL, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs // TODO-XARCH-AVX512 TT and IP encoded is pxord INST3(psadbw, "psadbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF6), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Compute the sum of absolute differences of packed unsigned 8-bit integers INST3(psubsb, "psubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation INST3(psubusb, "psubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), INS_TT_FULL_MEM, Input_8Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation @@ -493,10 +493,10 @@ INST3(vpbroadcastb, "pbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpbroadcastw, "pbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x79), INS_TT_TUPLE1_SCALAR, Input_16Bit | INS_FLAGS_None) // Broadcast int16 value from reg/memory to entire ymm register INST3(vpbroadcastd, "pbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), INS_TT_TUPLE1_SCALAR, Input_32Bit | INS_FLAGS_None) // Broadcast int32 value from reg/memory to entire ymm register INST3(vpbroadcastq, "pbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), INS_TT_TUPLE1_SCALAR, Input_64Bit | INS_FLAGS_None) // Broadcast int64 value from reg/memory to entire ymm register -INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed floating point values -INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Extract 128-bit packed integer values -INST3(vinsertf128, "insertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values -INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values +INST3(vextractf128, "extractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is extractf32x4 +INST3(vextracti128, "extracti128", IUM_WR, SSE3A(0x39), BAD_CODE, BAD_CODE, INS_TT_TUPLE4, Input_32Bit ) // Extract 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is extracti32x4 +INST3(vinsertf128, "insertf128",
IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed floating point values // TODO-XARCH-AVX512 TT and IP encoded is insertf32x4 +INST3(vinserti128, "inserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), INS_TT_TUPLE4, Input_32Bit | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 128-bit packed integer values // TODO-XARCH-AVX512 TT and IP encoded is inserti32x4 INST3(vzeroupper, "zeroupper", IUM_WR, 0xC577F8, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3(vperm2i128, "perm2i128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x46), INS_TT_NONE, INS_Flags_IsDstDstSrcAVXInstruction) // Permute 128-bit halves of input register INST3(vpermq, "permq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x00), INS_TT_FULL, Input_64Bit | INS_FLAGS_None) // Permute 64-bit of input register diff --git a/src/coreclr/jit/lsra.cpp b/src/coreclr/jit/lsra.cpp index 659d630f8d326b..877b5cd24d108a 100644 --- a/src/coreclr/jit/lsra.cpp +++ b/src/coreclr/jit/lsra.cpp @@ -256,6 +256,23 @@ regMaskTP LinearScan::allSIMDRegs() return availableFloatRegs; } +//------------------------------------------------------------------------ +// lowSIMDRegs(): Return the set of SIMD registers associated with VEX +// encoding only, i.e., remove the high EVEX SIMD registers from the available +// set. +// +// Return Value: +// Register mask of the SSE/VEX-only SIMD registers +// +regMaskTP LinearScan::lowSIMDRegs() +{ +#if defined(TARGET_AMD64) + return (availableFloatRegs & RBM_LOWFLOAT); +#else + return availableFloatRegs; +#endif +} + void LinearScan::updateNextFixedRef(RegRecord* regRecord, RefPosition* nextRefPosition) { LsraLocation nextLocation; @@ -460,8 +477,19 @@ regMaskTP LinearScan::stressLimitRegs(RefPosition* refPosition, regMaskTP mask) } break; +#if defined(TARGET_AMD64) + case LSRA_LIMIT_UPPER_SIMD_SET: + if ((mask & LsraLimitUpperSimdSet) != RBM_NONE) + { + mask = getConstrainedRegMask(mask, LsraLimitUpperSimdSet, minRegCount); + } + break; +#endif + default: + { unreached(); + } } if (refPosition != nullptr && refPosition->isFixedRegRef) @@ -671,6 +699,17 @@ LinearScan::LinearScan(Compiler* theCompiler) } #endif // TARGET_AMD64 || TARGET_ARM64 +#if defined(TARGET_AMD64) + // TODO-XARCH-AVX512 switch this to canUseEvexEncoding() once we independently + // allow EVEX use from the stress flag (currently, if EVEX stress is turned off, + // we cannot use EVEX at all) + if (compiler->DoJitStressEvexEncoding()) + { + availableFloatRegs |= RBM_HIGHFLOAT; + availableDoubleRegs |= RBM_HIGHFLOAT; + } +#endif + for (unsigned int i = 0; i < TYP_COUNT; i++) { var_types thisType = (var_types)genActualTypes[i]; @@ -1848,7 +1887,7 @@ void LinearScan::identifyCandidates() } } JITDUMP(" "); - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(compiler)); } else { @@ -4025,7 +4064,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) { // Just clear any constant registers and return. 
resetAvailableRegs(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -4273,7 +4312,7 @@ void LinearScan::processBlockStartLocations(BasicBlock* currentBlock) resetRegState(); setRegsInUse(liveRegs); } - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); if ((liveRegs & genRegMask(reg)) == 0) @@ -4555,7 +4594,7 @@ void LinearScan::allocateRegisters() } resetRegState(); - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->recentRefPosition = nullptr; @@ -4718,7 +4757,7 @@ void LinearScan::allocateRegisters() #ifdef DEBUG // Validate the current state just after we've freed the registers. This ensures that any pending // freed registers will have had their state updated to reflect the intervals they were holding. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { regMaskTP regMask = genRegMask(reg); // If this isn't available or if it's still waiting to be freed (i.e. it was in @@ -5647,7 +5686,7 @@ void LinearScan::allocateRegisters() if (interval.isActive) { printf("Active "); - interval.dump(); + interval.dump(this->compiler); } } @@ -6638,7 +6677,7 @@ void LinearScan::resolveRegisters() // are encountered. 
if (enregisterLocalVars) { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); Interval* assignedInterval = physRegRecord->assignedInterval; @@ -8930,29 +8969,6 @@ void LinearScan::dumpLsraStatsSummary(FILE* file) #endif // TRACK_LSRA_STATS #ifdef DEBUG -void dumpRegMask(regMaskTP regs) -{ - if (regs == RBM_ALLINT) - { - printf("[allInt]"); - } - else if (regs == (RBM_ALLINT & ~RBM_FPBASE)) - { - printf("[allIntButFP]"); - } - else if (regs == RBM_ALLFLOAT) - { - printf("[allFloat]"); - } - else if (regs == RBM_ALLDOUBLE) - { - printf("[allDouble]"); - } - else - { - dspRegMask(regs); - } -} static const char* getRefTypeName(RefType refType) { @@ -9024,7 +9040,7 @@ void RefPosition::dump(LinearScan* linearScan) printf(FMT_BB " ", this->bbNum); printf("regmask="); - dumpRegMask(registerAssignment); + linearScan->compiler->dumpRegMask(registerAssignment); printf(" minReg=%d", minRegCandidateCount); @@ -9087,7 +9103,7 @@ void RegRecord::dump() tinyDump(); } -void Interval::dump() +void Interval::dump(Compiler* compiler) { printf("Interval %2u:", intervalIndex); @@ -9160,7 +9176,7 @@ void Interval::dump() printf(" physReg:%s", getRegName(physReg)); printf(" Preferences="); - dumpRegMask(this->registerPreferences); + compiler->dumpRegMask(this->registerPreferences); if (relatedInterval) { @@ -9242,7 +9258,7 @@ void LinearScan::lsraDumpIntervals(const char* msg) { // only dump something if it has references // if (interval->firstRefPosition) - interval.dump(); + interval.dump(this->compiler); } printf("\n"); @@ -10378,7 +10394,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10482,7 +10498,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -10807,7 +10823,7 @@ void LinearScan::verifyFinalAllocation() } // Clear register assignments. - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* physRegRecord = getRegisterRecord(reg); physRegRecord->assignedInterval = nullptr; @@ -11833,7 +11849,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, if (preferCalleeSave) { - regMaskTP calleeSaveCandidates = calleeSaveRegs(currentInterval->registerType); + regMaskTP calleeSaveCandidates = linearScan->calleeSaveRegs(currentInterval->registerType); if (currentInterval->isWriteThru) { // We'll only prefer a callee-save register if it's already been used. 
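A note on the `linearScan->calleeSaveRegs(...)` / `linearScan->callerSaveRegs(...)` call sites above: these helpers can no longer be free functions because, on AMD64, `RBM_FLT_CALLEE_TRASH` is no longer a compile-time constant — it expands to `get_RBM_FLT_CALLEE_TRASH()`, which must resolve against per-compilation state. A minimal standalone sketch of that lookup chain, using stand-in types (suffixed `Sk`) rather than the real JIT declarations:

```cpp
#include <cstdint>

using regMaskSk = uint64_t; // stand-in for regMaskTP, now 64-bit on AMD64

struct CompilerSk // stand-in: owns the masks computed once at JIT startup
{
    regMaskSk rbmFltCalleeTrash; // base trash set, plus XMM16-XMM31 when EVEX is enabled
};

struct LinearScanSk // stand-in for LinearScan
{
    CompilerSk* compiler;

    // Inside LinearScan members, the RBM_FLT_CALLEE_TRASH macro expands to this
    // accessor, so callerSaveRegs needs a `this` pointer — hence the member call
    // through `linearScan->` in RegisterSelection::select.
    regMaskSk get_RBM_FLT_CALLEE_TRASH() const { return compiler->rbmFltCalleeTrash; }

    regMaskSk callerSaveFloatRegs() const { return get_RBM_FLT_CALLEE_TRASH(); }
};
```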
@@ -11849,7 +11865,7 @@ regMaskTP LinearScan::RegisterSelection::select(Interval* currentInterval, } else { - callerCalleePrefs = callerSaveRegs(currentInterval->registerType); + callerCalleePrefs = linearScan->callerSaveRegs(currentInterval->registerType); } // If this has a delayed use (due to being used in a rmw position of a diff --git a/src/coreclr/jit/lsra.h b/src/coreclr/jit/lsra.h index cfbd74487f4947..e6ca1d06e0955b 100644 --- a/src/coreclr/jit/lsra.h +++ b/src/coreclr/jit/lsra.h @@ -64,22 +64,6 @@ inline bool registerTypesEquivalent(RegisterType a, RegisterType b) return varTypeIsIntegralOrI(a) == varTypeIsIntegralOrI(b); } -//------------------------------------------------------------------------ -// calleeSaveRegs: Get the set of callee-save registers of the given RegisterType -// -inline regMaskTP calleeSaveRegs(RegisterType rt) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; -} - -//------------------------------------------------------------------------ -// callerSaveRegs: Get the set of caller-save registers of the given RegisterType -// -inline regMaskTP callerSaveRegs(RegisterType rt) -{ - return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; -} - //------------------------------------------------------------------------ // RefInfo: Captures the necessary information for a definition that is "in-flight" // during `buildIntervals` (i.e. a tree-node definition has been encountered, @@ -736,8 +720,19 @@ class LinearScan : public LinearScanInterface unsigned lsraStressMask; // This controls the registers available for allocation - enum LsraStressLimitRegs{LSRA_LIMIT_NONE = 0, LSRA_LIMIT_CALLEE = 0x1, LSRA_LIMIT_CALLER = 0x2, - LSRA_LIMIT_SMALL_SET = 0x3, LSRA_LIMIT_MASK = 0x3}; + enum LsraStressLimitRegs + { + LSRA_LIMIT_NONE = 0, + LSRA_LIMIT_CALLEE = 0x1, + LSRA_LIMIT_CALLER = 0x2, + LSRA_LIMIT_SMALL_SET = 0x3, +#if defined(TARGET_AMD64) + LSRA_LIMIT_UPPER_SIMD_SET = 0x2000, + LSRA_LIMIT_MASK = 0x2003 +#else + LSRA_LIMIT_MASK = 0x3 +#endif + }; // When LSRA_LIMIT_SMALL_SET is specified, it is desirable to select a "mixed" set of caller- and callee-save // registers, so as to get different coverage than limiting to callee or caller. @@ -757,6 +752,9 @@ class LinearScan : public LinearScanInterface (RBM_EAX | RBM_ECX | RBM_EBX | RBM_ETW_FRAMED_EBP | RBM_ESI | RBM_EDI); #endif // !UNIX_AMD64_ABI static const regMaskTP LsraLimitSmallFPSet = (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM6 | RBM_XMM7); + static const regMaskTP LsraLimitUpperSimdSet = + (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | + RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31); #elif defined(TARGET_ARM) // On ARM, we may need two registers to set up the target register for a virtual call, so we need // to have at least the maximum number of arg registers, plus 2. 
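For orientation, `LsraLimitUpperSimdSet` above spells out the same sixteen registers that targetamd64.h groups as `RBM_HIGHFLOAT`, so the new `LSRA_LIMIT_UPPER_SIMD_SET` stress mode (enum value 0x2000) constrains float/SIMD allocation to XMM16-XMM31 and exercises the EVEX-only paths. A standalone sketch of the mask value, assuming the XMM registers occupy consecutive bits as in the register.h changes below:

```cpp
#include <cstdint>

// Stand-in for XMMMASK(n): bit n within the XMM block of the register mask.
constexpr uint64_t xmmBit(unsigned n) { return 1ULL << n; }

// LsraLimitUpperSimdSet written as a range instead of sixteen OR'd RBM_XMMnn terms.
constexpr uint64_t upperSimdSet()
{
    uint64_t mask = 0;
    for (unsigned i = 16; i < 32; ++i) // XMM16..XMM31
    {
        mask |= xmmBit(i);
    }
    return mask;
}

static_assert(upperSimdSet() == 0xFFFF0000ULL, "XMM16-XMM31 occupy bits 16..31 of the XMM block");
```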
@@ -1062,6 +1060,7 @@ class LinearScan : public LinearScanInterface regMaskTP allRegs(RegisterType rt); regMaskTP allByteRegs(); regMaskTP allSIMDRegs(); + regMaskTP lowSIMDRegs(); regMaskTP internalFloatRegCandidates(); void makeRegisterInactive(RegRecord* physRegRecord); @@ -1846,6 +1845,7 @@ class LinearScan : public LinearScanInterface int BuildCastUses(GenTreeCast* cast, regMaskTP candidates); #ifdef TARGET_XARCH int BuildRMWUses(GenTree* node, GenTree* op1, GenTree* op2, regMaskTP candidates = RBM_NONE); + inline regMaskTP BuildEvexIncompatibleMask(GenTree* tree); #endif // !TARGET_XARCH int BuildSelect(GenTreeOp* select); // This is the main entry point for building the RefPositions for a node. @@ -1926,6 +1926,40 @@ class LinearScan : public LinearScanInterface int BuildPutArgSplit(GenTreePutArgSplit* tree); #endif // FEATURE_ARG_SPLIT int BuildLclHeap(GenTree* tree); + +#if defined(TARGET_AMD64) + regMaskTP get_RBM_ALLFLOAT() const + { + return compiler->rbmAllFloat; + } + regMaskTP get_RBM_FLT_CALLEE_TRASH() const + { + return compiler->rbmFltCalleeTrash; + } + unsigned get_AVAILABLE_REG_COUNT() const + { + return compiler->availableRegCount; + } +#endif // TARGET_AMD64 + + //------------------------------------------------------------------------ + // calleeSaveRegs: Get the set of callee-save registers of the given RegisterType + // + // NOTE: we currently don't need a LinearScan `this` pointer for this definition, and some callers + // don't have one available, so make it static. + // + static regMaskTP calleeSaveRegs(RegisterType rt) + { + return varTypeIsIntegralOrI(rt) ? RBM_INT_CALLEE_SAVED : RBM_FLT_CALLEE_SAVED; + } + + //------------------------------------------------------------------------ + // callerSaveRegs: Get the set of caller-save registers of the given RegisterType + // + regMaskTP callerSaveRegs(RegisterType rt) const + { + return varTypeIsIntegralOrI(rt) ?
RBM_INT_CALLEE_TRASH : RBM_FLT_CALLEE_TRASH; + } }; /*XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX @@ -1976,7 +2010,7 @@ class Interval : public Referenceable #ifdef DEBUG // print out representation - void dump(); + void dump(Compiler* compiler); // concise representation for embedding void tinyDump(); // extremely concise representation @@ -2186,7 +2220,7 @@ class Interval : public Referenceable if (preferCalleeSave) { - regMaskTP calleeSaveMask = (calleeSaveRegs(this->registerType) & (newPreferences)); + regMaskTP calleeSaveMask = (LinearScan::calleeSaveRegs(this->registerType) & newPreferences); if (calleeSaveMask != RBM_NONE) { newPreferences = calleeSaveMask; @@ -2511,10 +2545,6 @@ class RefPosition #endif // DEBUG }; -#ifdef DEBUG -void dumpRegMask(regMaskTP regs); -#endif // DEBUG - /*****************************************************************************/ #endif //_LSRA_H_ /*****************************************************************************/ diff --git a/src/coreclr/jit/lsrabuild.cpp b/src/coreclr/jit/lsrabuild.cpp index 3908f1998792a9..e6988402f5ce66 100644 --- a/src/coreclr/jit/lsrabuild.cpp +++ b/src/coreclr/jit/lsrabuild.cpp @@ -159,7 +159,7 @@ Interval* LinearScan::newInterval(RegisterType theRegisterType) newInt->intervalIndex = static_cast(intervals.size() - 1); #endif // DEBUG - DBEXEC(VERBOSE, newInt->dump()); + DBEXEC(VERBOSE, newInt->dump(this->compiler)); return newInt; } @@ -1212,7 +1212,7 @@ bool LinearScan::buildKillPositionsForNode(GenTree* tree, LsraLocation currentLo // If there are no callee-saved registers, the call could kill all the registers. // This is a valid state, so in that case assert should not trigger. The RA will spill in order // to free a register later. - assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType)) == RBM_NONE); + assert(compiler->opts.compDbgEnC || (calleeSaveRegs(varDsc->lvType) == RBM_NONE)); } } } @@ -1860,8 +1860,9 @@ void LinearScan::buildRefPositionsForNode(GenTree* tree, LsraLocation currentLoc JITDUMP("\n"); } -static const regNumber lsraRegOrder[] = {REG_VAR_ORDER}; -const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder); +static const regNumber lsraRegOrder[] = {REG_VAR_ORDER}; +const unsigned lsraRegOrderSize = ArrLen(lsraRegOrder); +// TODO-XARCH-AVX512 we might want to move this to be configured with the rbm variables too static const regNumber lsraRegOrderFlt[] = {REG_VAR_ORDER_FLT}; const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt); @@ -1870,7 +1871,7 @@ const unsigned lsraRegOrderFltSize = ArrLen(lsraRegOrderFlt); // void LinearScan::buildPhysRegRecords() { - for (regNumber reg = REG_FIRST; reg < ACTUAL_REG_COUNT; reg = REG_NEXT(reg)) + for (regNumber reg = REG_FIRST; reg < AVAILABLE_REG_COUNT; reg = REG_NEXT(reg)) { RegRecord* curr = &physRegs[reg]; curr->init(reg); @@ -3010,7 +3011,7 @@ void LinearScan::UpdatePreferencesOfDyingLocal(Interval* interval) { printf("Last use of V%02u between PUTARG and CALL. 
Removing occupied arg regs from preferences: ", compiler->lvaTrackedIndexToLclNum(varIndex)); - dumpRegMask(unpref); + compiler->dumpRegMask(unpref); printf("\n"); } #endif diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index c0fd6030c28804..74d46520215a1f 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -156,7 +156,7 @@ int LinearScan::BuildNode(GenTree* tree) srcCount = 0; assert(dstCount == 1); assert(!tree->IsReuseRegVal()); - RefPosition* def = BuildDef(tree); + RefPosition* def = BuildDef(tree, BuildEvexIncompatibleMask(tree)); def->getInterval()->isConstant = true; } break; @@ -1885,21 +1885,24 @@ int LinearScan::BuildIntrinsic(GenTree* tree) break; } assert(tree->gtGetOp2IfPresent() == nullptr); + + // TODO-XARCH-AVX512 this is overly constraining the registers available, as NI_System_Math_Abs + // can be lowered to an EVEX compatible instruction (the rest cannot) int srcCount; if (op1->isContained()) { - srcCount = BuildOperandUses(op1); + srcCount = BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); } else { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount = 1; } if (internalFloatDef != nullptr) { buildInternalRegisterUses(); } - BuildDef(tree); + BuildDef(tree, BuildEvexIncompatibleMask(tree)); return srcCount; } @@ -2006,6 +2009,9 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // Determine whether this is an RMW operation where op2+ must be marked delayFree so that it // is not allocated the same register as the target. bool isRMW = intrinsicTree->isRMWHWIntrinsic(compiler); +#if defined(TARGET_AMD64) + bool isEvexCompatible = intrinsicTree->isEvexCompatibleHWIntrinsic(); +#endif // Create internal temps, and handle any other special requirements. // Note that the default case for building uses will handle the RMW flag, but if the uses @@ -2090,8 +2096,8 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(!isRMW); // MaskMove hardcodes the destination (op3) in DI/EDI/RDI - srcCount += BuildOperandUses(op1); - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)); srcCount += BuildOperandUses(op3, RBM_EDI); buildUses = false; @@ -2107,10 +2113,11 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(isRMW); // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, BuildEvexIncompatibleMask(op1)); srcCount += 1; - srcCount += op2->isContained() ? BuildOperandUses(op2) : BuildDelayFreeUses(op2, op1); + srcCount += op2->isContained() ?
BuildOperandUses(op2, BuildEvexIncompatibleMask(op2)) + : BuildDelayFreeUses(op2, op1, BuildEvexIncompatibleMask(op2)); srcCount += BuildDelayFreeUses(op3, op1, RBM_XMM0); buildUses = false; @@ -2305,14 +2312,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou assert(!isRMW); // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); // op3 should always be contained assert(op3->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2328,16 +2335,16 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou GenTree* op5 = intrinsicTree->Op(5); // Any pair of the index, mask, or destination registers should be different - srcCount += BuildOperandUses(op1); - srcCount += BuildDelayFreeUses(op2); - srcCount += BuildDelayFreeUses(op3); - srcCount += BuildDelayFreeUses(op4); + srcCount += BuildOperandUses(op1, BuildEvexIncompatibleMask(op1)); + srcCount += BuildDelayFreeUses(op2, nullptr, BuildEvexIncompatibleMask(op2)); + srcCount += BuildDelayFreeUses(op3, nullptr, BuildEvexIncompatibleMask(op3)); + srcCount += BuildDelayFreeUses(op4, nullptr, BuildEvexIncompatibleMask(op4)); // op5 should always be contained assert(op5->isContained()); // get a tmp register for mask that will be cleared by gather instructions - buildInternalFloatRegisterDefForNode(intrinsicTree, allSIMDRegs()); + buildInternalFloatRegisterDefForNode(intrinsicTree, lowSIMDRegs()); setInternalRegsDelayFree = true; buildUses = false; @@ -2355,25 +2362,40 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou { assert((numArgs > 0) && (numArgs < 4)); + regMaskTP op1RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op1RegCandidates = BuildEvexIncompatibleMask(op1); + } +#endif + if (intrinsicTree->OperIsMemoryLoadOrStore()) { - srcCount += BuildAddrUses(op1); + srcCount += BuildAddrUses(op1, op1RegCandidates); } else if (isRMW && !op1->isContained()) { - tgtPrefUse = BuildUse(op1); + tgtPrefUse = BuildUse(op1, op1RegCandidates); srcCount += 1; } else { - srcCount += BuildOperandUses(op1); + srcCount += BuildOperandUses(op1, op1RegCandidates); } if (op2 != nullptr) { + regMaskTP op2RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op2RegCandidates = BuildEvexIncompatibleMask(op2); + } +#endif if (op2->OperIs(GT_HWINTRINSIC) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained()) { - srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1)); + srcCount += BuildAddrUses(op2->AsHWIntrinsic()->Op(1), op2RegCandidates); } else if (isRMW) { @@ -2382,7 +2404,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained and we are commutative, we can set op2 // to also be a tgtPrefUse. Codegen will then swap the operands. 
- tgtPrefUse2 = BuildUse(op2); + tgtPrefUse2 = BuildUse(op2, op2RegCandidates); srcCount += 1; } else if (!op2->isContained() || varTypeIsArithmetic(intrinsicTree->TypeGet())) @@ -2390,7 +2412,7 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // When op2 is not contained or if we are producing a scalar value // we need to mark it as delay free because the operand and target // exist in the same register set. - srcCount += BuildDelayFreeUses(op2, op1); + srcCount += BuildDelayFreeUses(op2, op1, op2RegCandidates); } else { @@ -2398,17 +2420,25 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou // have no concerns of overwriting op2 because they exist in different // register sets. - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } } else { - srcCount += BuildOperandUses(op2); + srcCount += BuildOperandUses(op2, op2RegCandidates); } if (op3 != nullptr) { + regMaskTP op3RegCandidates = RBM_NONE; +#if defined(TARGET_AMD64) + if (!isEvexCompatible) + { + op3RegCandidates = BuildEvexIncompatibleMask(op3); + } +#endif + srcCount += isRMW ? BuildDelayFreeUses(op3, op1, op3RegCandidates) + : BuildOperandUses(op3, op3RegCandidates); } } } @@ -2418,6 +2448,14 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou if (dstCount == 1) { +#if defined(TARGET_AMD64) + if (!intrinsicTree->isEvexCompatibleHWIntrinsic() && + (varTypeIsFloating(intrinsicTree->gtType) || varTypeIsSIMD(intrinsicTree->gtType))) + { + dstCandidates = lowSIMDRegs(); + } +#endif + BuildDef(intrinsicTree, dstCandidates); } else @@ -2701,4 +2739,43 @@ void LinearScan::SetContainsAVXFlags(unsigned sizeOfSIMDVector /* = 0*/) } } +//------------------------------------------------------------------------------ +// BuildEvexIncompatibleMask: Returns RBM_NONE or a mask representing the +// lower SIMD registers for a node that lowers to an instruction that does not +// have an EVEX form (thus cannot use the upper SIMD registers). +// The caller invokes this function when it knows the node is EVEX incompatible. +// +// Simply using lowSIMDRegs() on an incompatible node's operand will incorrectly mask +// some cases, e.g., memory loads. +// +// Arguments: +// tree - tree to check for EVEX lowering compatibility +// +// Return Value: +// RBM_NONE if compatible with EVEX (or not a floating/SIMD register), +// lowSIMDRegs() (XMM0-XMM15) otherwise. +// +inline regMaskTP LinearScan::BuildEvexIncompatibleMask(GenTree* tree) +{ +#if defined(TARGET_AMD64) + if (!(varTypeIsFloating(tree->gtType) || varTypeIsSIMD(tree->gtType))) + { + return RBM_NONE; + } + + // If a node is contained and is a memory load etc., use RBM_NONE as it will use an integer register for the + // load, not a SIMD register.
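+ // (A contained indirection, LEA, or contained memory-load intrinsic is folded into the parent instruction's memory operand, so it never occupies a SIMD register itself.)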
+ if (tree->isContained() && + (tree->OperIsIndir() || (tree->OperIs(GT_HWINTRINSIC) && tree->AsHWIntrinsic()->OperIsMemoryLoad()) || + tree->OperIs(GT_LEA))) + { + return RBM_NONE; + } + + return lowSIMDRegs(); +#else + return RBM_NONE; +#endif +} + #endif // TARGET_XARCH diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 50555a7c1a256a..35786b781d544a 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -6966,7 +6966,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that are not heavy: tree->GetCostEx() < (2*IND_COST_EX) if (tree->GetCostEx() < (2 * IND_COST_EX)) { - JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree cost too low: %d < %d (loopVarCount %u >= availRegCount %u)\n", tree->GetCostEx(), 2 * IND_COST_EX, loopVarCount, availRegCount); return false; } @@ -6985,7 +6985,7 @@ bool Compiler::optIsProfitableToHoistTree(GenTree* tree, unsigned lnum) // Don't hoist expressions that barely meet CSE cost requirements: tree->GetCostEx() == MIN_CSE_COST if (tree->GetCostEx() <= MIN_CSE_COST + 1) { - JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availableRegCount %u)\n", tree->GetCostEx(), + JITDUMP(" tree not good CSE: %d <= %d (varInOutCount %u > availRegCount %u)\n", tree->GetCostEx(), 2 * MIN_CSE_COST + 1, varInOutCount, availRegCount) return false; } diff --git a/src/coreclr/jit/register.h b/src/coreclr/jit/register.h index 6f63bc51211d63..ca90673e85adfe 100644 --- a/src/coreclr/jit/register.h +++ b/src/coreclr/jit/register.h @@ -94,7 +94,27 @@ REGDEF(XMM12, 12+XMMBASE, XMMMASK(12), "mm12" ) REGDEF(XMM13, 13+XMMBASE, XMMMASK(13), "mm13" ) REGDEF(XMM14, 14+XMMBASE, XMMMASK(14), "mm14" ) REGDEF(XMM15, 15+XMMBASE, XMMMASK(15), "mm15" ) -REGDEF(STK, 16+XMMBASE, 0x0000, "STK" ) + +REGDEF(XMM16, 16+XMMBASE, XMMMASK(16), "mm16" ) +REGDEF(XMM17, 17+XMMBASE, XMMMASK(17), "mm17" ) +REGDEF(XMM18, 18+XMMBASE, XMMMASK(18), "mm18" ) +REGDEF(XMM19, 19+XMMBASE, XMMMASK(19), "mm19" ) +REGDEF(XMM20, 20+XMMBASE, XMMMASK(20), "mm20" ) +REGDEF(XMM21, 21+XMMBASE, XMMMASK(21), "mm21" ) +REGDEF(XMM22, 22+XMMBASE, XMMMASK(22), "mm22" ) +REGDEF(XMM23, 23+XMMBASE, XMMMASK(23), "mm23" ) + +REGDEF(XMM24, 24+XMMBASE, XMMMASK(24), "mm24" ) +REGDEF(XMM25, 25+XMMBASE, XMMMASK(25), "mm25" ) +REGDEF(XMM26, 26+XMMBASE, XMMMASK(26), "mm26" ) +REGDEF(XMM27, 27+XMMBASE, XMMMASK(27), "mm27" ) +REGDEF(XMM28, 28+XMMBASE, XMMMASK(28), "mm28" ) +REGDEF(XMM29, 29+XMMBASE, XMMMASK(29), "mm29" ) +REGDEF(XMM30, 30+XMMBASE, XMMMASK(30), "mm30" ) +REGDEF(XMM31, 31+XMMBASE, XMMMASK(31), "mm31" ) + +REGDEF(STK, 32+XMMBASE, 0x0000, "STK" ) + #endif // !TARGET_X86 #elif defined(TARGET_ARM) diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 392a5417141398..cc97831c9f5287 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -61,7 +61,11 @@ inline bool compUnixX86Abi() /*****************************************************************************/ // The following are intended to capture only those #defines that cannot be replaced // with static const members of Target -#if defined(TARGET_XARCH) +#if defined(TARGET_AMD64) +#define REGMASK_BITS 64 +#define CSE_CONST_SHARED_LOW_BITS 16 + +#elif defined(TARGET_X86) #define REGMASK_BITS 32 #define CSE_CONST_SHARED_LOW_BITS 16 @@ -146,13 +150,14 @@ enum _regNumber_enum : unsigned ACTUAL_REG_COUNT = REG_COUNT - 1 // everything but REG_STK (only real regs) }; -enum 
_regMask_enum : unsigned +enum _regMask_enum : uint64_t { RBM_NONE = 0, #define REGDEF(name, rnum, mask, sname) RBM_##name = mask, #define REGALIAS(alias, realname) RBM_##alias = RBM_##realname, #include "register.h" + }; #elif defined(TARGET_X86) @@ -181,6 +186,13 @@ enum _regMask_enum : unsigned #error Unsupported target architecture #endif +#if defined(TARGET_AMD64) +// AVAILABLE_REG_COUNT is defined to be dynamic, based on whether AVX-512 high registers are available. +#define AVAILABLE_REG_COUNT get_AVAILABLE_REG_COUNT() +#else +#define AVAILABLE_REG_COUNT ACTUAL_REG_COUNT +#endif + /*****************************************************************************/ // TODO-Cleanup: The types defined below are mildly confusing: why are there both? @@ -192,7 +204,7 @@ enum _regMask_enum : unsigned // In any case, we believe that is OK to freely cast between these types; no information will // be lost. -#if defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) +#if defined(TARGET_AMD64) || defined(TARGET_ARMARCH) || defined(TARGET_LOONGARCH64) typedef unsigned __int64 regMaskTP; #else typedef unsigned regMaskTP; @@ -528,7 +540,7 @@ inline regMaskTP genRegMask(regNumber reg) // (L1 latency on sandy bridge is 4 cycles for [base] and 5 for [base + index*c] ) // the reason this is AMD-only is because the x86 BE will try to get reg masks for REG_STK // and the result needs to be zero. - regMaskTP result = 1 << reg; + regMaskTP result = 1ULL << reg; assert(result == regMasks[reg]); return result; #else diff --git a/src/coreclr/jit/targetamd64.h b/src/coreclr/jit/targetamd64.h index 4ec128a6345d21..64af2659bd592d 100644 --- a/src/coreclr/jit/targetamd64.h +++ b/src/coreclr/jit/targetamd64.h @@ -78,10 +78,17 @@ #endif // !UNIX_AMD64_ABI #define CSE_CONSTS 1 // Enable if we want to CSE constants - #define RBM_ALLFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15) + #define RBM_LOWFLOAT (RBM_XMM0 | RBM_XMM1 | RBM_XMM2 | RBM_XMM3 | RBM_XMM4 | RBM_XMM5 | RBM_XMM6 | RBM_XMM7 | RBM_XMM8 | RBM_XMM9 | RBM_XMM10 | RBM_XMM11 | RBM_XMM12 | RBM_XMM13 | RBM_XMM14 | RBM_XMM15 ) + #define RBM_HIGHFLOAT (RBM_XMM16 | RBM_XMM17 | RBM_XMM18 | RBM_XMM19 | RBM_XMM20 | RBM_XMM21 | RBM_XMM22 | RBM_XMM23 | RBM_XMM24 | RBM_XMM25 | RBM_XMM26 | RBM_XMM27 | RBM_XMM28 | RBM_XMM29 | RBM_XMM30 | RBM_XMM31) + #define CNT_HIGHFLOAT 16 + + #define RBM_ALLFLOAT_INIT RBM_LOWFLOAT + + #define RBM_ALLFLOAT get_RBM_ALLFLOAT() + #define RBM_ALLDOUBLE RBM_ALLFLOAT #define REG_FP_FIRST REG_XMM0 - #define REG_FP_LAST REG_XMM15 + #define REG_FP_LAST REG_XMM31 #define FIRST_FP_ARGREG REG_XMM0 #ifdef UNIX_AMD64_ABI @@ -117,8 +124,11 @@ #define RBM_INT_CALLEE_SAVED (RBM_EBX|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_RDI|RBM_RSI|RBM_EDX|RBM_ECX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (0) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ + + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5|RBM_XMM6|RBM_XMM7| \ RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) + #define REG_PROFILER_ENTER_ARG_0 REG_R14 #define RBM_PROFILER_ENTER_ARG_0 RBM_R14 #define REG_PROFILER_ENTER_ARG_1 REG_R15 @@ -132,15 +142,19 @@ #define RBM_INT_CALLEE_SAVED 
(RBM_EBX|RBM_ESI|RBM_EDI|RBM_ETW_FRAMED_EBP|RBM_R12|RBM_R13|RBM_R14|RBM_R15) #define RBM_INT_CALLEE_TRASH (RBM_EAX|RBM_ECX|RBM_EDX|RBM_R8|RBM_R9|RBM_R10|RBM_R11) #define RBM_FLT_CALLEE_SAVED (RBM_XMM6|RBM_XMM7|RBM_XMM8|RBM_XMM9|RBM_XMM10|RBM_XMM11|RBM_XMM12|RBM_XMM13|RBM_XMM14|RBM_XMM15) - #define RBM_FLT_CALLEE_TRASH (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) + /* NOTE: Sync with variable name defined in compiler.h */ + #define RBM_FLT_CALLEE_TRASH_INIT (RBM_XMM0|RBM_XMM1|RBM_XMM2|RBM_XMM3|RBM_XMM4|RBM_XMM5) #endif // !UNIX_AMD64_ABI + #define RBM_FLT_CALLEE_TRASH get_RBM_FLT_CALLEE_TRASH() + #define RBM_OSR_INT_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_EBP) #define REG_FLT_CALLEE_SAVED_FIRST REG_XMM6 #define REG_FLT_CALLEE_SAVED_LAST REG_XMM15 #define RBM_CALLEE_TRASH (RBM_INT_CALLEE_TRASH | RBM_FLT_CALLEE_TRASH) + #define RBM_CALLEE_SAVED (RBM_INT_CALLEE_SAVED | RBM_FLT_CALLEE_SAVED) #define RBM_ALLINT (RBM_INT_CALLEE_SAVED | RBM_INT_CALLEE_TRASH) @@ -169,7 +183,7 @@ #define REG_WRITE_BARRIER_SRC REG_ARG_1 #define RBM_WRITE_BARRIER_SRC RBM_ARG_1 - #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH + #define RBM_CALLEE_TRASH_NOGC RBM_CALLEE_TRASH // Registers killed by CORINFO_HELP_ASSIGN_REF and CORINFO_HELP_CHECKED_ASSIGN_REF. #define RBM_CALLEE_TRASH_WRITEBARRIER RBM_CALLEE_TRASH_NOGC @@ -181,7 +195,6 @@ #define RBM_CALLEE_TRASH_WRITEBARRIER_BYREF (RBM_RSI | RBM_RDI | RBM_CALLEE_TRASH_NOGC) // Registers no longer containing GC pointers after CORINFO_HELP_ASSIGN_BYREF. - // Note that RDI and RSI are still valid byref pointers after this helper call, despite their value being changed. #define RBM_CALLEE_GCTRASH_WRITEBARRIER_BYREF (RBM_CALLEE_TRASH_NOGC & ~(RBM_RDI | RBM_RSI)) #if 0 @@ -203,7 +216,10 @@ #endif // !UNIX_AMD64_ABI #endif - #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7,REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15 + #define REG_VAR_ORDER_FLT REG_XMM0,REG_XMM1,REG_XMM2,REG_XMM3,REG_XMM4,REG_XMM5,REG_XMM6,REG_XMM7, \ + REG_XMM8,REG_XMM9,REG_XMM10,REG_XMM11,REG_XMM12,REG_XMM13,REG_XMM14,REG_XMM15, \ + REG_XMM16,REG_XMM17,REG_XMM18,REG_XMM19,REG_XMM20,REG_XMM21,REG_XMM22,REG_XMM23, \ + REG_XMM24,REG_XMM25,REG_XMM26,REG_XMM27,REG_XMM28,REG_XMM29,REG_XMM30,REG_XMM31 #ifdef UNIX_AMD64_ABI #define CNT_CALLEE_SAVED (5 + REG_ETW_FRAMED_EBP_COUNT) @@ -211,7 +227,9 @@ #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) #define CNT_CALLEE_SAVED_FLOAT (0) - #define CNT_CALLEE_TRASH_FLOAT (16) + #define CNT_CALLEE_TRASH_FLOAT_INIT (16) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) + /* NOTE: Sync with variable name defined in compiler.h */ #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 @@ -220,13 +238,16 @@ #define CNT_CALLEE_TRASH (7) #define CNT_CALLEE_ENREG (CNT_CALLEE_SAVED) - #define CNT_CALLEE_SAVED_FLOAT (10) - #define CNT_CALLEE_TRASH_FLOAT (6) - + #define CNT_CALLEE_SAVED_FLOAT (10) + #define CNT_CALLEE_TRASH_FLOAT_INIT (6) + #define CNT_CALLEE_TRASH_HIGHFLOAT (16) + /* NOTE: Sync with variable name defined in compiler.h */ #define REG_CALLEE_SAVED_ORDER REG_EBX,REG_ESI,REG_EDI,REG_ETW_FRAMED_EBP_LIST REG_R12,REG_R13,REG_R14,REG_R15 #define RBM_CALLEE_SAVED_ORDER RBM_EBX,RBM_ESI,RBM_EDI,RBM_ETW_FRAMED_EBP_LIST RBM_R12,RBM_R13,RBM_R14,RBM_R15 #endif // !UNIX_AMD64_ABI + #define CNT_CALLEE_TRASH_FLOAT get_CNT_CALLEE_TRASH_FLOAT() + #define CALLEE_SAVED_REG_MAXSZ 
(CNT_CALLEE_SAVED*REGSIZE_BYTES) #define CALLEE_SAVED_FLOAT_MAXSZ (CNT_CALLEE_SAVED_FLOAT*16) @@ -413,8 +434,9 @@ // The registers trashed by profiler enter/leave/tailcall hook // See vm\amd64\asmhelpers.asm for more details. - #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH - #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH + #define RBM_PROFILER_ENTER_TRASH RBM_CALLEE_TRASH + + #define RBM_PROFILER_TAILCALL_TRASH RBM_PROFILER_LEAVE_TRASH // The registers trashed by the CORINFO_HELP_STOP_FOR_GC helper. #ifdef UNIX_AMD64_ABI @@ -423,11 +445,11 @@ // On Unix a struct of size >=9 and <=16 bytes in size is returned in two return registers. // The return registers could be any two from the set { RAX, RDX, XMM0, XMM1 }. // STOP_FOR_GC helper preserves all the 4 possible return registers. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET | RBM_FLOATRET_1 | RBM_INTRET_1)) #else // See vm\amd64\asmhelpers.asm for more details. - #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) + #define RBM_STOP_FOR_GC_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) #define RBM_PROFILER_LEAVE_TRASH (RBM_CALLEE_TRASH & ~(RBM_FLOATRET | RBM_INTRET)) #endif diff --git a/src/coreclr/jit/utils.h b/src/coreclr/jit/utils.h index 0e129f1ed0340c..47558b161e3334 100644 --- a/src/coreclr/jit/utils.h +++ b/src/coreclr/jit/utils.h @@ -419,6 +419,16 @@ class PhasedVar return *this; } + PhasedVar& operator|=(const T& value) + { +#ifdef DEBUG + assert(m_writePhase); + m_initialized = true; +#endif // DEBUG + m_value |= value; + return *this; + } + // Note: if you need more = functions, you can define them here, like operator&= // Assign a value, but don't assert if we're not in the write phase, and diff --git a/src/coreclr/vm/threadsuspend.cpp b/src/coreclr/vm/threadsuspend.cpp index 73f10f1ef4ce4f..89f2f9d33f7e2c 100644 --- a/src/coreclr/vm/threadsuspend.cpp +++ b/src/coreclr/vm/threadsuspend.cpp @@ -1970,14 +1970,14 @@ CONTEXT* AllocateOSContextHelper(BYTE** contextBuffer) // Determine if the processor supports AVX so we could // retrieve extended registers DWORD64 FeatureMask = GetEnabledXStateFeatures(); - if ((FeatureMask & XSTATE_MASK_AVX) != 0) + if ((FeatureMask & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0) { context = context | CONTEXT_XSTATE; } // Retrieve contextSize by passing NULL for Buffer DWORD contextSize = 0; - ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX; + ULONG64 xStateCompactionMask = XSTATE_MASK_LEGACY | XSTATE_MASK_AVX | XSTATE_MASK_MPX | XSTATE_MASK_AVX512; // The initialize call should fail but return contextSize BOOL success = g_pfnInitializeContext2 ? g_pfnInitializeContext2(NULL, context, NULL, &contextSize, xStateCompactionMask) : @@ -2899,7 +2899,7 @@ BOOL Thread::RedirectThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt) // This should not normally fail. // The system silently ignores any feature specified in the FeatureMask // which is not enabled on the processor. - SetXStateFeaturesMask(pCtx, XSTATE_MASK_AVX); + SetXStateFeaturesMask(pCtx, (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); #endif //defined(TARGET_X86) || defined(TARGET_AMD64) // Make sure we specify CONTEXT_EXCEPTION_REQUEST to detect "trap frame reporting". 
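The threadsuspend.cpp hunks above and below all follow the same OS contract: a context may request AVX-512 state only when GetEnabledXStateFeatures() reports it enabled, and SetXStateFeaturesMask() must be limited to enabled features. A condensed sketch of that pattern, using only the documented Windows XState APIs (buffer sizing and error paths elided; XSTATE_MASK_AVX512 is assumed to be supplied by a recent SDK's winnt.h):

#include <windows.h>

// Sketch: initialize a CONTEXT that can carry AVX/AVX-512 state when the OS enables it.
PCONTEXT InitContextWithXState(void* buffer, DWORD* pcbBuffer)
{
    DWORD   ctxFlags = CONTEXT_FULL;
    DWORD64 enabled  = GetEnabledXStateFeatures();

    if ((enabled & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) != 0)
    {
        ctxFlags |= CONTEXT_XSTATE;
    }

    PCONTEXT pCtx = NULL;
    if (!InitializeContext(buffer, ctxFlags, &pCtx, pcbBuffer))
    {
        return NULL;
    }

    if ((ctxFlags & CONTEXT_XSTATE) != 0)
    {
        // The system silently ignores any requested feature the processor does
        // not enable, but masking with `enabled` makes the intent explicit.
        SetXStateFeaturesMask(pCtx, enabled & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512));
    }

    return pCtx;
}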
@@ -3035,7 +3035,7 @@ BOOL Thread::RedirectCurrentThreadAtHandledJITCase(PFN_REDIRECTTARGET pTgt, CONT // Get may return 0 if no XState is set, which Set would not accept. if (srcFeatures != 0) { - success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & XSTATE_MASK_AVX); + success = SetXStateFeaturesMask(pCurrentThreadCtx, srcFeatures & (XSTATE_MASK_AVX | XSTATE_MASK_AVX512)); _ASSERTE(success); if (!success) return FALSE; diff --git a/src/tests/Common/testenvironment.proj b/src/tests/Common/testenvironment.proj index 11b358a7f39f2e..b3de6ea96268d6 100644 --- a/src/tests/Common/testenvironment.proj +++ b/src/tests/Common/testenvironment.proj @@ -153,6 +153,7 @@ +
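Finally, a note on why target.h widens regMaskTP to 64 bits and genRegMask now shifts 1ULL: with the sixteen new XMM registers, register ordinals exceed 31 (REG_XMM31 is 31+XMMBASE), so a 32-bit `1 << reg` would be undefined behavior and could not represent the high masks at all. A standalone sketch of the arithmetic (XMMBASE assumed to be 16 on AMD64, matching the REGDEF table above):

#include <cassert>
#include <cstdint>

typedef uint64_t regMaskTP; // matches the widened regMaskTP on TARGET_AMD64

enum regNumber : unsigned
{
    XMMBASE   = 16,           // assumed: 16 integer registers precede the XMM block
    REG_XMM15 = 15 + XMMBASE, // ordinal 31: the last register a 32-bit mask could hold
    REG_XMM31 = 31 + XMMBASE, // ordinal 47: requires a 64-bit mask
};

inline regMaskTP genRegMask(unsigned reg)
{
    return 1ULL << reg; // `1 << 47` on a 32-bit int would be undefined behavior
}

int main()
{
    assert(genRegMask(REG_XMM15) == (1ULL << 31));
    assert(genRegMask(REG_XMM31) == (1ULL << 47));
    return 0;
}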