diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp index 4b14b8d1c451dd..e965e6638a26d3 100644 --- a/src/coreclr/jit/codegenxarch.cpp +++ b/src/coreclr/jit/codegenxarch.cpp @@ -9126,10 +9126,6 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_R_R_I(INS_shld, EA_4BYTE, REG_EAX, REG_ECX, 5); theEmitter->emitIns_R_R_I(INS_shrd, EA_2BYTE, REG_EAX, REG_ECX, 5); - // TODO-XArch-apx: S_R_I path only accepts SEE or VEX instructions, - // so I assuem shld/shrd will not be taking the first argument from stack. - // theEmitter->emitIns_S_R_I(INS_shld, EA_2BYTE, 1, 2, REG_EAX, 5); - // theEmitter->emitIns_S_R_I(INS_shrd, EA_2BYTE, 1, 2, REG_EAX, 5); theEmitter->emitIns_AR_R(INS_cmpxchg, EA_2BYTE, REG_EAX, REG_EDX, 2); @@ -9299,6 +9295,51 @@ void CodeGen::genAmd64EmitterUnitTestsApx() theEmitter->emitIns_BASE_R_R(INS_inc, EA_4BYTE, REG_R11, REG_R12); theEmitter->emitIns_BASE_R_R_I(INS_add, EA_4BYTE, REG_R11, REG_R12, 5); + + // testing for EGPR encodings. + GenTreePhysReg eGPR(REG_R16); + eGPR.SetRegNum(REG_R16); + GenTreeIndir loadGPR = indirForm(TYP_SIMD32, &eGPR); + + // // SIMD instructions + // // In most of the cases, EGPR will only be used as BASE/INDEX registers in SIMD instructions. + theEmitter->emitIns_R_R_A(INS_addps, EA_32BYTE, REG_XMM16, REG_XMM16, &loadGPR); + + // // Legacy instructions + theEmitter->emitIns_R_ARX(INS_add, EA_4BYTE, REG_R16, REG_R17, REG_R18, 1, 0); + + theEmitter->emitIns_AR_R(INS_movnti, EA_8BYTE, REG_R17, REG_R16, 10); + theEmitter->emitIns_R_R_R(INS_andn, EA_8BYTE, REG_R17, REG_R16, REG_R18); + + theEmitter->emitIns_Mov(INS_kmovb_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovb_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovw_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovw_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovd_gpr, EA_4BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovd_msk, EA_4BYTE, REG_K5, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovq_gpr, EA_8BYTE, REG_R16, REG_K0, false); + theEmitter->emitIns_Mov(INS_kmovq_msk, EA_8BYTE, REG_K5, REG_K0, false); + + theEmitter->emitIns_R_R(INS_crc32_apx, EA_1BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_R(INS_crc32_apx, EA_2BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_R(INS_crc32_apx, EA_8BYTE, REG_R16, REG_R17); + theEmitter->emitIns_R_A(INS_crc32_apx, EA_8BYTE, REG_R18, &loadGPR); + theEmitter->emitIns_R_S(INS_crc32_apx, EA_8BYTE, REG_R18, 0, 0); + + // Note that BZHI has a reversed src operands due to special handling at import. 
+ theEmitter->emitIns_R_R_R(INS_bzhi, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_bzhi, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_mulx, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_mulx, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pdep, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pdep, EA_8BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pext, EA_4BYTE, REG_R16, REG_R18, REG_R17); + theEmitter->emitIns_R_R_R(INS_pext, EA_8BYTE, REG_R16, REG_R18, REG_R17); + + theEmitter->emitIns_Mov(INS_movd, EA_4BYTE, REG_R16, REG_XMM0, false); + theEmitter->emitIns_Mov(INS_movd, EA_4BYTE, REG_R16, REG_XMM16, false); + theEmitter->emitIns_Mov(INS_movq, EA_8BYTE, REG_R16, REG_XMM0, false); + theEmitter->emitIns_Mov(INS_movq, EA_8BYTE, REG_R16, REG_XMM16, false); } void CodeGen::genAmd64EmitterUnitTestsAvx10v2() diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 6763f1f2d5231f..7b354ddbc6a9e9 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -3948,7 +3948,7 @@ class Compiler unsigned lvaInlineeReturnSpillTemp = BAD_VAR_NUM; // The temp to spill the non-VOID return expression // in case there are multiple BBJ_RETURN blocks in the inlinee // or if the inlinee has GC ref locals. - + bool lvaInlineeReturnSpillTempFreshlyCreated = false; // True if the temp was freshly created for the inlinee return #if FEATURE_FIXED_OUT_ARGS @@ -4476,7 +4476,7 @@ class Compiler CompAllocator alloc(compiler->getAllocator(CMK_Generic)); compiler->impEnumeratorGdvLocalMap = new (alloc) NodeToUnsignedMap(alloc); } - + return compiler->impEnumeratorGdvLocalMap; } diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index 7682b98a3a68ab..04fddafa511528 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -359,12 +359,13 @@ bool emitter::IsApxNFEncodableInstruction(instruction ins) const // bool emitter::IsApxExtendedEvexInstruction(instruction ins) const { +#ifdef TARGET_AMD64 if (!UsePromotedEVEXEncoding()) { return false; } - if (HasApxNdd(ins) || HasApxNf(ins)) + if (HasApxNdd(ins) || HasApxNf(ins) || (ins == INS_crc32_apx)) { return true; } @@ -375,6 +376,9 @@ bool emitter::IsApxExtendedEvexInstruction(instruction ins) const } return false; +#else // !TARGET_AMD64 + return false; +#endif } //------------------------------------------------------------------------ @@ -1711,6 +1715,14 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const if (HasHighSIMDReg(id) || (id->idOpSize() == EA_64BYTE) || HasMaskReg(id)) { // Requires the EVEX encoding due to used registers + // A special case here is KMOV, the original KMOV introduced in Avx512 can only be encoded in VEX, APX promoted + // them to EVEX, so only return true when APX is available. + if ((ins == INS_kmovb_msk) || (ins == INS_kmovw_msk) || (ins == INS_kmovd_msk) || (ins == INS_kmovq_msk) || + (ins == INS_kmovb_gpr) || (ins == INS_kmovw_gpr) || (ins == INS_kmovd_gpr) || (ins == INS_kmovq_gpr)) + { + // Use EVEX only when needed. + return HasExtendedGPReg(id); + } return true; } @@ -1720,6 +1732,14 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const return true; } + if (HasExtendedGPReg(id)) + { + // TODO-XArch-apx: + // revisit this part: this may have some conflicts with REX2 prefix, we may prefer REX2 if only EGPR is + // involved. 
+ return true; + } + if (id->idIsEvexNfContextSet() && IsBMIInstruction(ins)) { // Only a few BMI instructions shall be promoted to APX-EVEX due to NF feature. @@ -1773,6 +1793,7 @@ bool emitter::TakesEvexPrefix(const instrDesc* id) const // bool emitter::TakesRex2Prefix(const instrDesc* id) const { +#ifdef TARGET_AMD64 // Return true iff the instruction supports REX2 encoding, and it requires to access EGPRs. // TODO-xarch-apx: @@ -1803,6 +1824,9 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const #endif // DEBUG return false; +#else // !TARGET_AMD64 + return false; +#endif } //------------------------------------------------------------------------ @@ -1816,9 +1840,7 @@ bool emitter::TakesRex2Prefix(const instrDesc* id) const // bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const { - // TODO-XArch-APX: - // Isolating legacy-promoted-EVEX case out from VEX/EVEX-promoted-EVEX, - // as the latter ones are relatively simple, providing EGPRs functionality, +#ifdef TARGET_AMD64 instruction ins = id->idIns(); if (!IsApxExtendedEvexInstruction(ins)) { @@ -1846,6 +1868,11 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const return true; } + if (ins == INS_crc32_apx) + { + return true; + } + #if defined(DEBUG) if (emitComp->DoJitStressPromotedEvexEncoding()) { @@ -1858,6 +1885,9 @@ bool emitter::TakesApxExtendedEvexPrefix(const instrDesc* id) const } return false; +#else // !TARGET_AMD64 + return false; +#endif } // Intel AVX-512 encoding is defined in "Intel 64 and ia-32 architectures software developer's manual volume 2", Section @@ -2387,7 +2417,12 @@ bool emitter::HasMaskReg(const instrDesc* id) const } #if defined(DEBUG) - assert(!isMaskReg(id->idReg2())); + // After APX, KMOV instructions can be encoded in EVEX. + if (isMaskReg(id->idReg2())) + { + assert(IsKInstruction(id->idIns())); + return UsePromotedEVEXEncoding(); + } if (!id->idIsSmallDsc()) { @@ -2733,7 +2768,7 @@ emitter::code_t emitter::AddRexPrefix(instruction ins, code_t code) emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) { #if defined(TARGET_AMD64) - assert(UseEvexEncoding() && hasEvexPrefix(code)); + assert((UseEvexEncoding() || UsePromotedEVEXEncoding()) && hasEvexPrefix(code)); return emitter::code_t(code & 0xFFFFFFF7FFFFFFFFULL); #else unreached(); @@ -2753,7 +2788,7 @@ emitter::code_t emitter::AddEvexVPrimePrefix(code_t code) emitter::code_t emitter::AddEvexRPrimePrefix(code_t code) { #if defined(TARGET_AMD64) - assert(UseEvexEncoding() && hasEvexPrefix(code)); + assert((UseEvexEncoding() || UsePromotedEVEXEncoding()) && hasEvexPrefix(code)); return emitter::code_t(code & 0xFFEFFFFFFFFFFFFFULL); #else unreached(); @@ -2822,13 +2857,45 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co { case 0x66: { - // None of the existing BMI instructions should be EVEX encoded. - // After APX, BMI instructions can be EVEX encoded with NF feature. + // After APX, BMI instructions can be encoded in EVEX. if (IsBMIInstruction(ins)) { - // if BMI instructions reaches this part, then it should be APX-EVEX. - // although the opcode of all the BMI instructions are defined with 0x66, - // but it should not, skip this check. 
+ switch (ins) + { + case INS_rorx: + case INS_pdep: + case INS_mulx: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shrx: +#endif + { + evexPrefix |= (0x03 << 8); + break; + } + + case INS_pext: +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_sarx: +#endif + { + evexPrefix |= (0x02 << 8); + break; + } +// TODO: Unblock when enabled for x86 +#ifdef TARGET_AMD64 + case INS_shlx: + { + evexPrefix |= (0x01 << 8); + break; + } +#endif + default: + { + break; + } + } break; } assert(!IsBMIInstruction(ins)); @@ -4231,8 +4298,19 @@ inline unsigned emitter::insEncodeReg012(const instrDesc* id, regNumber reg, emi if (IsExtendedGPReg(reg)) { // Seperate the encoding for REX2.B3/B4, REX2.B3 will be handled in `AddRexBPrefix`. - assert(TakesRex2Prefix(id)); - *code |= 0x001000000000ULL; // REX2.B4 + assert(TakesRex2Prefix(id) || TakesApxExtendedEvexPrefix(id) || TakesEvexPrefix(id)); + if (hasRex2Prefix(*code)) + { + *code |= 0x001000000000ULL; // REX2.B4 + } + else if (hasEvexPrefix(*code)) + { + *code |= 0x8000000000000ULL; // EVEX.B4 + } + else + { + // There are cases when this method is called before prefix is attached. + } } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) @@ -4280,8 +4358,19 @@ inline unsigned emitter::insEncodeReg345(const instrDesc* id, regNumber reg, emi if (IsExtendedGPReg(reg)) { // Seperate the encoding for REX2.R3/R4, REX2.R3 will be handled in `AddRexRPrefix`. - assert(TakesRex2Prefix(id)); - *code |= 0x004000000000ULL; // REX2.R4 + assert(TakesRex2Prefix(id) || TakesApxExtendedEvexPrefix(id) || TakesEvexPrefix(id)); + if (hasRex2Prefix(*code)) + { + *code |= 0x004000000000ULL; // REX2.R4 + } + else if (hasEvexPrefix(*code)) + { + *code = AddEvexRPrimePrefix(*code); // EVEX.R4 + } + else + { + // There are cases when this method is called before prefix is attached. + } } } else if ((EA_SIZE(size) == EA_1BYTE) && (reg > REG_RBX) && (code != nullptr)) @@ -4339,6 +4428,12 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber // Have to set the EVEX V' bit code = AddEvexVPrimePrefix(code); } + + if (isHighGPReg(reg) && IsBMIInstruction(ins)) + { + // APX: BMI instructions use RVM operand encoding + code = AddEvexVPrimePrefix(code); + } #endif // Shift count = 5-bytes of opcode + 0-2 bits for EVEX @@ -4364,7 +4459,7 @@ inline emitter::code_t emitter::insEncodeReg3456(const instrDesc* id, regNumber // Rather see these paths cleaned up. regBits = HighAwareRegEncoding(reg); - if (false /*reg >= REG_R16 && reg <= REG_R31*/) + if (isHighGPReg(reg)) { // Have to set the EVEX V' bit code = AddEvexVPrimePrefix(code); @@ -4410,8 +4505,21 @@ inline unsigned emitter::insEncodeRegSIB(const instrDesc* id, regNumber reg, cod if (IsExtendedGPReg(reg)) { // Separate the encoding for REX2.X3/X4, REX2.X3 will be handled in `AddRexXPrefix`. - assert(TakesRex2Prefix(id)); - *code |= 0x002000000000ULL; // REX2.X4 + assert(TakesRex2Prefix(id) || TakesApxExtendedEvexPrefix(id) || TakesEvexPrefix(id)); + if (hasRex2Prefix(*code)) + { + *code |= 0x002000000000ULL; // REX2.X4 + } + else if (hasEvexPrefix(*code)) + { + // Note that APX-EVEX use EVEX.X4 as the MSB of the INDEX register to address GPRs, and the original + // EVEX.V4 is used for VSIB addressing. + *code &= 0xFFFFFBFFFFFFFFFFULL; // EVEX.X4 + } + else + { + // There are cases when this method is called before prefix is attached. 
+ } } } unsigned regBits = RegEncoding(reg); @@ -14534,6 +14642,12 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) NO_WAY("unexpected size"); break; } +#ifdef TARGET_AMD64 + if (ins == INS_crc32_apx) + { + code |= (insEncodeReg345(id, id->idReg1(), size, &code) << 8); + } +#endif // TARGET_AMD64 } // Output the REX prefix @@ -15356,6 +15470,12 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) { dst += emitOutputByte(dst, 0x66); } +#ifdef TARGET_AMD64 + else + { + code |= EXTENDED_EVEX_PP_BITS; + } +#endif // TARGET_AMD64 } FALLTHROUGH; @@ -15401,6 +15521,14 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) NO_WAY("unexpected size"); break; } +#ifdef TARGET_AMD64 + if (ins == INS_crc32_apx) + { + // The promoted CRC32 is in 1-byte opcode, unlike other instructions on this path, the register encoding for + // CRC32 need to be done here. + code |= (insEncodeReg345(id, id->idReg1(), size, &code) << 8); + } +#endif // TARGET_AMD64 } // Output the REX prefix @@ -16513,7 +16641,7 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) else if ((ins == INS_bsf) || (ins == INS_bsr) || (ins == INS_crc32) || (ins == INS_lzcnt) || (ins == INS_popcnt) || (ins == INS_tzcnt) #ifdef TARGET_AMD64 - || (ins == INS_lzcnt_apx) || (ins == INS_tzcnt_apx) || (ins == INS_popcnt_apx) + || (ins == INS_lzcnt_apx) || (ins == INS_tzcnt_apx) || (ins == INS_popcnt_apx) || (ins == INS_crc32_apx) #endif // TARGET_AMD64 ) { @@ -16525,11 +16653,24 @@ BYTE* emitter::emitOutputRR(BYTE* dst, instrDesc* id) { code |= 0x0100; } +#ifdef TARGET_AMD64 + if ((ins == INS_crc32_apx) && (size > EA_1BYTE)) + { + code |= 0x01; + } +#endif // TARGET_AMD64 - if (size == EA_2BYTE && !TakesApxExtendedEvexPrefix(id)) + if (size == EA_2BYTE) { - assert(ins == INS_crc32); - dst += emitOutputByte(dst, 0x66); + if (!TakesApxExtendedEvexPrefix(id)) + { + assert(ins == INS_crc32); + dst += emitOutputByte(dst, 0x66); + } + else + { + code |= EXTENDED_EVEX_PP_BITS; + } } else if (size == EA_8BYTE) { @@ -16982,14 +17123,13 @@ BYTE* emitter::emitOutputRI(BYTE* dst, instrDesc* id) code = insCodeACC(ins); assert(code < 0x100); - code |= 0x08; // Set the 'w' bit - unsigned regcode = insEncodeReg012(id, reg, size, &code); - code |= regcode; - // This is INS_mov and will not take VEX prefix assert(!TakesVexPrefix(ins)); code = AddX86PrefixIfNeededAndNotPresent(id, code, size); + code |= 0x08; // Set the 'w' bit + unsigned regcode = insEncodeReg012(id, reg, size, &code); + code |= regcode; if (TakesRexWPrefix(id)) { @@ -21345,6 +21485,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_popcnt_apx: case INS_lzcnt_apx: case INS_tzcnt_apx: + case INS_crc32_apx: #endif // TARGET_AMD64 { result.insThroughput = PERFSCORE_THROUGHPUT_1C; diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index 93bd131f5a3472..583ff4c8f93bc3 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -297,6 +297,11 @@ bool HasKMaskRegisterDest(instruction ins) const case INS_vgatherqps: case INS_vgatherdpd: case INS_vgatherqpd: + // KMOV can be promoted to EVEX with APX. 
+ case INS_kmovb_msk: + case INS_kmovw_msk: + case INS_kmovd_msk: + case INS_kmovq_msk: { return true; } diff --git a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp index 96d40a9e43b555..78183d6d4d36de 100644 --- a/src/coreclr/jit/hwintrinsiccodegenxarch.cpp +++ b/src/coreclr/jit/hwintrinsiccodegenxarch.cpp @@ -2231,15 +2231,47 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node, insOpts instOptions) assert(!op2->isUsedFromReg() || (op2->GetRegNum() != targetReg) || (op1Reg == targetReg)); emit->emitIns_Mov(INS_mov, emitTypeSize(targetType), targetReg, op1Reg, /* canSkip */ true); + instruction ins = INS_crc32; +#ifdef TARGET_AMD64 + bool needsEvex = false; + if (emit->IsExtendedGPReg(targetReg)) + { + needsEvex = true; + } + else if (op2->isUsedFromReg() && emit->IsExtendedGPReg(op2->GetRegNum())) + { + needsEvex = true; + } + else if (op2->isIndir()) + { + GenTreeIndir* indir = op2->AsIndir(); + + // We don't need to check if they are actually enregistered. + if (indir->HasBase() && emit->IsExtendedGPReg(indir->Base()->GetRegNum())) + { + needsEvex = true; + } + + if (indir->HasIndex() && emit->IsExtendedGPReg(indir->Index()->GetRegNum())) + { + needsEvex = true; + } + } + + if (needsEvex) + { + ins = INS_crc32_apx; + } +#endif // TARGET_AMD64 if ((baseType == TYP_UBYTE) || (baseType == TYP_USHORT)) // baseType is the type of the second argument { assert(targetType == TYP_INT); - genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(baseType), targetReg, op2, instOptions); + genHWIntrinsic_R_RM(node, ins, emitTypeSize(baseType), targetReg, op2, instOptions); } else { assert((targetType == TYP_INT) || (targetType == TYP_LONG)); - genHWIntrinsic_R_RM(node, INS_crc32, emitTypeSize(targetType), targetReg, op2, instOptions); + genHWIntrinsic_R_RM(node, ins, emitTypeSize(targetType), targetReg, op2, instOptions); } break; diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index 5957b4deb9799b..7427d05bef5aaa 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -228,10 +228,10 @@ INST3(movups, "movups", IUM_WR, PCKFLT(0x11), BAD_CODE, INST3(mulps, "mulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Multiply packed singles INST3(mulss, "mulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single INST3(orps, "orps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), INS_TT_FULL, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Or packed singles -INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG) -INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG) -INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG) -INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG) +INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | Encoding_REX2) +INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | Encoding_REX2) +INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | 
Encoding_REX2) +INST3(prefetcht2, "prefetcht2", IUM_RD, 0x000F1818, BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WIG | Encoding_REX2) INST3(rcpps, "rcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), INS_TT_NONE, REX_WIG | Encoding_VEX) // Reciprocal of packed singles INST3(rcpss, "rcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), INS_TT_NONE, REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single INST3(rsqrtps, "rsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x52), INS_TT_NONE, REX_WIG | Encoding_VEX) // Reciprocal Sqrt of packed singles @@ -280,16 +280,16 @@ INST3(mfence, "mfence", IUM_RD, 0x000FF0AE, BAD_CODE, INST3(minpd, "minpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5D), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Return Minimum packed doubles INST3(minsd, "minsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double INST3(movapd, "movapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX) // Move DWORD/QWORD between xmm regs <-> memory/r32/r64 regs -INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movd, "movd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WX | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move DWORD/QWORD between xmm regs <-> memory/r32/r64 regs +INST3(movdqa, "movdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2) +INST3(movdqu, "movdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 ) INST3(movhpd, "movhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlpd, "movlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movmskpd, "movmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), INS_TT_NONE, REX_WIG | Encoding_VEX) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. 
INST3(movntdq, "movntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WX) +INST3(movnti, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, INS_TT_NONE, REX_WX | Encoding_REX2) INST3(movntpd, "movntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Move Quadword between memory/mm <-> regs +INST3(movq, "movq", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move Quadword between memory/mm <-> regs INST3(movsd_simd, "movsd", IUM_WR, SSEDBL(0x11), BAD_CODE, SSEDBL(0x10), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movupd, "movupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), INS_TT_FULL_MEM, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) INST3(mulpd, "mulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), INS_TT_FULL, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_Flags_EmbeddedBroadcastSupported) // Multiply packed doubles @@ -602,15 +602,15 @@ INST3(blsmsk, "blsmsk", IUM_WR, BAD_CODE, BAD_CODE, INST3(blsr, "blsr", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF3), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Reset Lowest Set Bit // BMI2 -INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position -INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags -INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit -INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract -INST3(rorx, "rorx", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX) +INST3(bzhi, "bzhi", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF) // Zero High Bits Starting with Specified Bit Position +INST3(mulx, "mulx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF6), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Unsigned Multiply Without Affecting Flags +INST3(pdep, "pdep", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Deposit +INST3(pext, "pext", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF5), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Parallel Bits Extract +INST3(rorx, "rorx", IUM_WR, BAD_CODE, 
BAD_CODE, SSE3A(0xF0), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX) #ifdef TARGET_AMD64 -INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags -INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags -INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags +INST3(sarx, "sarx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Arithmetic Right Without Affecting Flags +INST3(shlx, "shlx", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Left Without Affecting Flags +INST3(shrx, "shrx", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF7), INS_TT_NONE, REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift Logical Right Without Affecting Flags #endif INST3(LAST_BMI_INSTRUCTION, "LAST_BMI_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, BAD_CODE, INS_TT_NONE, INS_FLAGS_None) @@ -622,8 +622,8 @@ INST3(FIRST_AVX512_INSTRUCTION, "FIRST_AVX512_INSTRUCTION", IUM_WR, BAD_CODE, BA // AVX512F INST3(kandw, "kandw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND masks INST3(kandnw, "kandnw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND NOT masks -INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers -INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovw_gpr, "kmovw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers +INST3(kmovw_msk, "kmovw", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers INST3(knotw, "knotw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register INST3(korw, "korw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x45), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical OR masks INST3(kortestw, "kortestw", IUM_RD, BAD_CODE, BAD_CODE, PCKFLT(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags @@ -772,10 +772,10 @@ INST3(kandd, "kandd", IUM_WR, BAD_CODE, BAD_ INST3(kandq, "kandq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x41), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND masks INST3(kandnd, "kandnd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND NOT masks INST3(kandnq, "kandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x42), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction | 
KInstructionWithLBit) // Bitwise logical AND NOT masks -INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers -INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers -INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers -INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovd_gpr, "kmovd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers +INST3(kmovd_msk, "kmovd", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers +INST3(kmovq_gpr, "kmovq", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x92), INS_TT_NONE, REX_W1 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers +INST3(kmovq_msk, "kmovq", IUM_WR, PCKFLT(0x91), BAD_CODE, PCKFLT(0x90), INS_TT_NONE, REX_W1 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers INST3(knotd, "knotd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register INST3(knotq, "knotq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x44), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction) // NOT mask register INST3(kord, "kord", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W1 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical OR masks @@ -838,8 +838,8 @@ INST3(kaddb, "kaddb", IUM_WR, BAD_CODE, BAD_ INST3(kaddw, "kaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x4A), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Add two masks INST3(kandb, "kandb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x41), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND masks INST3(kandnb, "kandnb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x42), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical AND NOT masks -INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers -INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // Move from and to mask registers +INST3(kmovb_gpr, "kmovb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x92), INS_TT_NONE, REX_W0 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers +INST3(kmovb_msk, "kmovb", IUM_WR, PCKDBL(0x91), BAD_CODE, PCKDBL(0x90), INS_TT_NONE, REX_W0 | Encoding_VEX | Encoding_EVEX | KInstruction) // Move from and to mask registers INST3(knotb, "knotb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x44), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction) // NOT mask register INST3(korb, "korb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x45), INS_TT_NONE, REX_W0 | Encoding_VEX | KInstruction | KInstructionWithLBit) // Bitwise logical OR masks INST3(kortestb, "kortestb", IUM_RD, BAD_CODE, BAD_CODE, PCKDBL(0x98), INS_TT_NONE, REX_W0 | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF | KInstruction) // OR masks and set flags @@ 
-919,7 +919,7 @@ INST3(vcvttps2ibs, "cvttps2ibs", IUM_WR, BAD_CODE, BAD_ INST3(vcvttps2iubs, "cvttps2iubs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6A), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar single to unsigned DWORD/QWORD INST3(vmpsadbw, "mpsadbw", IUM_WR, BAD_CODE, BAD_CODE, AVX3A(0x42), INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference -INST3(vminmaxsd, "minmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x53), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Minimum/Maximum scalar double +INST3(vminmaxsd, "minmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x53), INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Minimum/Maximum scalar double INST3(vminmaxss, "minmaxss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x53), INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Minimum/Maximum scalar single INST3(vminmaxpd, "minmaxpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x52), INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_EmbeddedBroadcastSupported | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Maximum packed doubles INST3(vminmaxps, "minmaxps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x52), INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_EmbeddedBroadcastSupported | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Maximum packed singles @@ -969,24 +969,30 @@ INST3(LAST_APX_INSTRUCTION, "LAST_APX_INSTRUCTION", IUM_WR, BAD_CODE, BAD_CODE, // Scalar instructions in SSE4.2 INST3(crc32, "crc32", IUM_RW, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0xF0), INS_TT_NONE, INS_FLAGS_None) +#ifdef TARGET_AMD64 +INST3(crc32_apx, "crc32", IUM_RW, BAD_CODE, BAD_CODE, 0x0000F0, INS_TT_NONE, INS_FLAGS_None) +#endif // BMI1 INST3(tzcnt, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBC), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) // Count the Number of Trailing Zero Bits +#ifdef TARGET_AMD64 +INST3(tzcnt_apx, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits +#endif // LZCNT INST3(lzcnt, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xBD), INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) +#ifdef TARGET_AMD64 +INST3(lzcnt_apx, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) +#endif // MOVBE INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), INS_TT_NONE, INS_FLAGS_None) // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) - -#if defined(TARGET_AMD64) -INST3(tzcnt_apx, "tzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F4, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) // Count the Number of Trailing Zero Bits -INST3(lzcnt_apx, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x0000F5, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_Flags_Has_NF) -INST3(popcnt_apx, "popcnt", 
IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF) -#endif // TARGET_AMD64 +#ifdef TARGET_AMD64 +INST3(popcnt_apx, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, 0x000088, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | INS_Flags_Has_NF) +#endif INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD)
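Note on the emitExtractEvexPrefix change above: the 2-bit EVEX "pp" field stands in for the legacy SIMD prefix (01 = 0x66, 10 = 0xF3, 11 = 0xF2), so when a BMI instruction is promoted from VEX to APX-EVEX the prefix selector has to be written into the EVEX payload explicitly, which is what the new switch does (the diff places those two bits at bit position 8 of its evexPrefix value). The minimal standalone sketch below restates that mapping only for illustration; BmiIns and EvexPpBits are hypothetical names, not JIT types.

// Standalone sketch (not JIT code): map promoted BMI instructions to the
// EVEX pp field, mirroring the switch added in emitExtractEvexPrefix.
#include <cstdint>
#include <cstdio>

enum class BmiIns { bzhi, mulx, pdep, pext, rorx, sarx, shlx, shrx }; // simplified stand-in

static uint8_t EvexPpBits(BmiIns ins)
{
    switch (ins)
    {
        case BmiIns::mulx:
        case BmiIns::pdep:
        case BmiIns::rorx:
        case BmiIns::shrx:
            return 0x3; // instructions defined with an implied 0xF2 prefix
        case BmiIns::pext:
        case BmiIns::sarx:
            return 0x2; // implied 0xF3 prefix
        case BmiIns::shlx:
            return 0x1; // implied 0x66 prefix
        default:
            return 0x0; // e.g. bzhi takes no SIMD prefix
    }
}

int main()
{
    printf("pp(pdep)=%u pp(pext)=%u pp(shlx)=%u pp(bzhi)=%u\n",
           EvexPpBits(BmiIns::pdep), EvexPpBits(BmiIns::pext),
           EvexPpBits(BmiIns::shlx), EvexPpBits(BmiIns::bzhi));
    return 0;
}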