diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 3327e4bb2cdf41..039e2ccaad169e 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -2057,13 +2057,17 @@ class emitter #define PERFSCORE_THROUGHPUT_8C 8.0f // slower - 8 cycles #define PERFSCORE_THROUGHPUT_9C 9.0f // slower - 9 cycles #define PERFSCORE_THROUGHPUT_10C 10.0f // slower - 10 cycles -#define PERFSCORE_THROUGHPUT_11C 10.0f // slower - 10 cycles +#define PERFSCORE_THROUGHPUT_11C 11.0f // slower - 11 cycles +#define PERFSCORE_THROUGHPUT_12C 12.0f // slower - 12 cycles #define PERFSCORE_THROUGHPUT_13C 13.0f // slower - 13 cycles -#define PERFSCORE_THROUGHPUT_14C 14.0f // slower - 13 cycles -#define PERFSCORE_THROUGHPUT_16C 16.0f // slower - 13 cycles +#define PERFSCORE_THROUGHPUT_14C 14.0f // slower - 14 cycles +#define PERFSCORE_THROUGHPUT_16C 16.0f // slower - 16 cycles +#define PERFSCORE_THROUGHPUT_18C 18.0f // slower - 18 cycles #define PERFSCORE_THROUGHPUT_19C 19.0f // slower - 19 cycles #define PERFSCORE_THROUGHPUT_25C 25.0f // slower - 25 cycles +#define PERFSCORE_THROUGHPUT_32C 32.0f // slower - 32 cycles #define PERFSCORE_THROUGHPUT_33C 33.0f // slower - 33 cycles +#define PERFSCORE_THROUGHPUT_36C 36.0f // slower - 36 cycles #define PERFSCORE_THROUGHPUT_50C 50.0f // slower - 50 cycles #define PERFSCORE_THROUGHPUT_52C 52.0f // slower - 52 cycles #define PERFSCORE_THROUGHPUT_57C 57.0f // slower - 57 cycles @@ -2088,11 +2092,18 @@ class emitter #define PERFSCORE_LATENCY_14C 14.0f #define PERFSCORE_LATENCY_15C 15.0f #define PERFSCORE_LATENCY_16C 16.0f +#define PERFSCORE_LATENCY_17C 17.0f #define PERFSCORE_LATENCY_18C 18.0f #define PERFSCORE_LATENCY_20C 20.0f #define PERFSCORE_LATENCY_22C 22.0f #define PERFSCORE_LATENCY_23C 23.0f #define PERFSCORE_LATENCY_26C 26.0f +#define PERFSCORE_LATENCY_28C 28.0f +#define PERFSCORE_LATENCY_31C 31.0f +#define PERFSCORE_LATENCY_32C 32.0f +#define PERFSCORE_LATENCY_33C 33.0f +#define PERFSCORE_LATENCY_41C 41.0f +#define 
PERFSCORE_LATENCY_45C 45.0f #define PERFSCORE_LATENCY_62C 62.0f #define PERFSCORE_LATENCY_69C 69.0f #define PERFSCORE_LATENCY_105C 105.0f @@ -2105,6 +2116,11 @@ class emitter #if defined(TARGET_XARCH) +// a read has 2x (0.5) throughput, while a write has 1C (1.0) throughput +#define PERFSCORE_THROUGHPUT_RD PERFSCORE_THROUGHPUT_2X +#define PERFSCORE_THROUGHPUT_WR PERFSCORE_THROUGHPUT_1C +#define PERFSCORE_THROUGHPUT_RW PERFSCORE_THROUGHPUT_1C + // a read,write or modify from stack location, possible def to use latency from L0 cache #define PERFSCORE_LATENCY_RD_STACK PERFSCORE_LATENCY_2C #define PERFSCORE_LATENCY_WR_STACK PERFSCORE_LATENCY_2C diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index ecc7d78a340af1..d4678409670f09 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -20320,79 +20320,117 @@ const static float insThroughputInfos[] = // emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(instrDesc* id) { - insExecutionCharacteristics result; - instruction ins = id->idIns(); - insFormat insFmt = id->idInsFmt(); - emitAttr opSize = id->idOpSize(); - insFormat memFmt = getMemoryOperation(id); - unsigned memAccessKind; + instruction ins = id->idIns(); + insFormat insFmt = id->idInsFmt(); + emitAttr opSize = id->idOpSize(); + insFormat memFmt = getMemoryOperation(id); - result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; - result.insLatency = PERFSCORE_LATENCY_ILLEGAL; + // Model the memory throughput, latency, kind + + float memThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; + float memLatency = PERFSCORE_LATENCY_ILLEGAL; + unsigned memAccessKind = 0; - // Model the memory latency switch (memFmt) { - // Model a read from stack location, possible def to use latency from L0 cache + // Model a read or write from stack location, possible def to use latency from L0 cache + case IF_SRD: - result.insLatency = PERFSCORE_LATENCY_RD_STACK; - memAccessKind = PERFSCORE_MEMORY_READ; + { + 
memThroughput = PERFSCORE_THROUGHPUT_RD; + memLatency = PERFSCORE_LATENCY_RD_STACK; + memAccessKind = PERFSCORE_MEMORY_READ; break; + } case IF_SWR: - result.insLatency = PERFSCORE_LATENCY_WR_STACK; - memAccessKind = PERFSCORE_MEMORY_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_WR; + memLatency = PERFSCORE_LATENCY_WR_STACK; + memAccessKind = PERFSCORE_MEMORY_WRITE; break; + } case IF_SRW: - result.insLatency = PERFSCORE_LATENCY_RD_WR_STACK; - memAccessKind = PERFSCORE_MEMORY_READ_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_RW; + memLatency = PERFSCORE_LATENCY_RD_WR_STACK; + memAccessKind = PERFSCORE_MEMORY_READ_WRITE; break; + } + + // Model a read or write from a constant location, possible def to use latency from L0 cache - // Model a read from a constant location, possible def to use latency from L0 cache case IF_MRD: - result.insLatency = PERFSCORE_LATENCY_RD_CONST_ADDR; - memAccessKind = PERFSCORE_MEMORY_READ; + { + memThroughput = PERFSCORE_THROUGHPUT_RD; + memLatency = PERFSCORE_LATENCY_RD_CONST_ADDR; + memAccessKind = PERFSCORE_MEMORY_READ; break; + } case IF_MWR: - result.insLatency = PERFSCORE_LATENCY_WR_CONST_ADDR; - memAccessKind = PERFSCORE_MEMORY_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_WR; + memLatency = PERFSCORE_LATENCY_WR_CONST_ADDR; + memAccessKind = PERFSCORE_MEMORY_WRITE; break; + } case IF_MRW: - result.insLatency = PERFSCORE_LATENCY_RD_WR_CONST_ADDR; - memAccessKind = PERFSCORE_MEMORY_READ_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_RW; + memLatency = PERFSCORE_LATENCY_RD_WR_CONST_ADDR; + memAccessKind = PERFSCORE_MEMORY_READ_WRITE; break; + } + + // Model a read or write from memory location, possible def to use latency from L0 or L1 cache - // Model a read from memory location, possible def to use latency from L0 or L1 cache case IF_ARD: - result.insLatency = PERFSCORE_LATENCY_RD_GENERAL; - memAccessKind = PERFSCORE_MEMORY_READ; + { + memThroughput = PERFSCORE_THROUGHPUT_RD; + memLatency = 
PERFSCORE_LATENCY_RD_GENERAL; + memAccessKind = PERFSCORE_MEMORY_READ; break; + } case IF_AWR: - result.insLatency = PERFSCORE_LATENCY_WR_GENERAL; - memAccessKind = PERFSCORE_MEMORY_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_WR; + memLatency = PERFSCORE_LATENCY_WR_GENERAL; + memAccessKind = PERFSCORE_MEMORY_WRITE; break; + } case IF_ARW: - result.insLatency = PERFSCORE_LATENCY_RD_WR_GENERAL; - memAccessKind = PERFSCORE_MEMORY_READ_WRITE; + { + memThroughput = PERFSCORE_THROUGHPUT_RW; + memLatency = PERFSCORE_LATENCY_RD_WR_GENERAL; + memAccessKind = PERFSCORE_MEMORY_READ_WRITE; break; + } case IF_NONE: - result.insLatency = PERFSCORE_LATENCY_ZERO; - memAccessKind = PERFSCORE_MEMORY_NONE; + { + memThroughput = PERFSCORE_THROUGHPUT_ZERO; + memLatency = PERFSCORE_LATENCY_ZERO; + memAccessKind = PERFSCORE_MEMORY_NONE; break; + } default: + { assert(!"Unhandled insFmt for switch (memFmt)"); - result.insLatency = PERFSCORE_LATENCY_ZERO; - memAccessKind = PERFSCORE_MEMORY_NONE; + memThroughput = PERFSCORE_THROUGHPUT_ZERO; + memLatency = PERFSCORE_LATENCY_ZERO; + memAccessKind = PERFSCORE_MEMORY_NONE; break; + } } - result.insMemoryAccessKind = memAccessKind; + + float insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; + float insLatency = PERFSCORE_LATENCY_ILLEGAL; switch (ins) { @@ -20407,208 +20445,22 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // instruction is placed immediately after unconditional jmp. // In both cases, don't count for PerfScore. 
- result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; - result.insLatency = PERFSCORE_LATENCY_ZERO; + insThroughput = PERFSCORE_THROUGHPUT_ZERO; + insLatency = PERFSCORE_LATENCY_ZERO; break; } #endif - result.insThroughput = PERFSCORE_THROUGHPUT_4X; - result.insLatency = PERFSCORE_LATENCY_ZERO; - break; - } - - case INS_push: - case INS_push_hide: - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (insFmt == IF_RRD) // push reg - { - // For pushes (stack writes) we assume that the full latency will be covered - result.insLatency = PERFSCORE_LATENCY_ZERO; - } - break; - } - - case INS_push2: - { - // TODO-XArch-APX: to be verified. - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (insFmt == IF_RRD_RRD) // push2 reg1, reg2 - { - result.insLatency = PERFSCORE_LATENCY_ZERO; - } - break; - } - - case INS_pop: - case INS_pop_hide: - { - if (insFmt == IF_RWR) // pop reg - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - // For pops (stack reads) we assume that the full latency will be covered - result.insLatency = PERFSCORE_LATENCY_ZERO; - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - } - break; - } - - case INS_pop2: - { - // TODO-XArch-APX: to be verified. 
- result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (insFmt == IF_RRD_RRD) // pop2 reg1, reg2 - { - result.insLatency = PERFSCORE_LATENCY_ZERO; - } - break; - } - - case INS_inc: - case INS_dec: - case INS_neg: - case INS_not: - { - if (memFmt == IF_NONE) - { - // ins reg - result.insThroughput = PERFSCORE_THROUGHPUT_4X; - result.insLatency = PERFSCORE_LATENCY_1C; - } - else - { - // ins mem - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - // no additional R/W latency - } - break; - } - -#ifdef TARGET_AMD64 - case INS_movsxd: -#endif - case INS_mov: - case INS_movsx: - case INS_movzx: - case INS_cwde: - case INS_cmp: - case INS_test: - case INS_cmovo: - case INS_cmovno: - case INS_cmovb: - case INS_cmovae: - case INS_cmove: - case INS_cmovne: - case INS_cmovbe: - case INS_cmova: - case INS_cmovs: - case INS_cmovns: - case INS_cmovp: - case INS_cmovnp: - case INS_cmovl: - case INS_cmovge: - case INS_cmovle: - case INS_cmovg: -#ifdef TARGET_AMD64 - // todo-xarch-apx: we need to double check the logic for ccmp - case INS_ccmpo: - case INS_ccmpno: - case INS_ccmpb: - case INS_ccmpae: - case INS_ccmpe: - case INS_ccmpne: - case INS_ccmpbe: - case INS_ccmpa: - case INS_ccmps: - case INS_ccmpns: - case INS_ccmpt: - case INS_ccmpf: - case INS_ccmpl: - case INS_ccmpge: - case INS_ccmple: - case INS_ccmpg: -#endif - { - - if (memFmt == IF_NONE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_4X; - } - else if (memAccessKind == PERFSCORE_MEMORY_READ) - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - if (ins == INS_cmp || ins == INS_test || insIsCMOV(ins)) - { - result.insLatency += PERFSCORE_LATENCY_1C; - } - else if (ins == INS_movsx -#ifdef TARGET_AMD64 - || ins == INS_movsxd -#endif - ) - { - result.insLatency += PERFSCORE_LATENCY_2C; - } - } - else // writes - { - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - assert(ins == INS_mov); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - } - break; - } - - case INS_adc: - case INS_sbb: - { - if 
(memAccessKind != PERFSCORE_MEMORY_READ_WRITE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += PERFSCORE_LATENCY_1C; - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - // no additional R/W latency - } - break; - } - - case INS_add: - case INS_sub: - case INS_sub_hide: - case INS_and: - case INS_or: - case INS_xor: - { - if (memFmt == IF_NONE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_4X; - result.insLatency = PERFSCORE_LATENCY_1C; - } - else if (memAccessKind == PERFSCORE_MEMORY_READ_WRITE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - // no additional R/W latency - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += PERFSCORE_LATENCY_1C; - } + insThroughput = PERFSCORE_THROUGHPUT_4X; + insLatency = PERFSCORE_LATENCY_ZERO; break; } case INS_lea: { // uops.info - result.insThroughput = PERFSCORE_THROUGHPUT_2X; // one or two components - result.insLatency = PERFSCORE_LATENCY_1C; + insThroughput = PERFSCORE_THROUGHPUT_2X; // one or two components + insLatency = PERFSCORE_LATENCY_1C; if (insFmt == IF_RWR_LABEL) { @@ -20616,7 +20468,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // // - throughput is only 1 per cycle // - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; } else if (insFmt != IF_RWR_SRD) { @@ -20633,14 +20485,14 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // // - throughput is only 1 per cycle // - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; if (baseRegisterRequiresDisplacement(baseReg) || id->idIsDspReloc()) { // Increased Latency for these cases // - see https://reviews.llvm.org/D32277 // - result.insLatency = PERFSCORE_LATENCY_3C; + insLatency = PERFSCORE_LATENCY_3C; } } } @@ -20654,14 +20506,14 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // The integer 
divide instructions have long latencies if (opSize == EA_8BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_52C; - result.insLatency = PERFSCORE_LATENCY_62C; + insThroughput = PERFSCORE_THROUGHPUT_52C; + insLatency = PERFSCORE_LATENCY_62C; } else { assert(opSize == EA_4BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_6C; - result.insLatency = PERFSCORE_LATENCY_26C; + insThroughput = PERFSCORE_THROUGHPUT_6C; + insLatency = PERFSCORE_LATENCY_26C; } break; } @@ -20671,119 +20523,14 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins // The integer divide instructions have long latenies if (opSize == EA_8BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_57C; - result.insLatency = PERFSCORE_LATENCY_69C; + insThroughput = PERFSCORE_THROUGHPUT_57C; + insLatency = PERFSCORE_LATENCY_69C; } else { assert(opSize == EA_4BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_6C; - result.insLatency = PERFSCORE_LATENCY_26C; - } - break; - } - - case INS_shl: - case INS_shr: - case INS_sar: - case INS_ror: - case INS_rol: - { - switch (insFmt) - { - case IF_RRW_CNS: - // ins reg, cns - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_1C; - break; - - case IF_MRW_CNS: - case IF_SRW_CNS: - case IF_ARW_CNS: - // ins [mem], cns - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency += PERFSCORE_LATENCY_1C; - break; - - case IF_RRW: - // TODO-XArch-APX: to be verified if this data is correct for NDD form. 
- case IF_RWR_RRD: - // ins reg, cl - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency = PERFSCORE_LATENCY_2C; - break; - - case IF_MRW: - case IF_SRW: - case IF_ARW: - // ins [mem], cl - result.insThroughput = PERFSCORE_THROUGHPUT_4C; - result.insLatency += PERFSCORE_LATENCY_2C; - break; - - default: - // unhandled instruction insFmt combination - perfScoreUnhandledInstruction(id, &result); - break; - } - break; - } - - case INS_shl_1: - case INS_shr_1: - case INS_sar_1: - { - result.insLatency += PERFSCORE_LATENCY_1C; - switch (insFmt) - { - case IF_RRW: - // TODO-XArch-APX: to be verified if this data is correct for NDD form. - case IF_RWR_RRD: - // ins reg, 1 - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - break; - - case IF_MRW: - case IF_SRW: - case IF_ARW: - // ins [mem], 1 - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - break; - - default: - // unhandled instruction insFmt combination - perfScoreUnhandledInstruction(id, &result); - break; - } - break; - } - - case INS_shl_N: - case INS_shr_N: - case INS_sar_N: - case INS_ror_N: - case INS_rol_N: - { - result.insLatency += PERFSCORE_LATENCY_1C; - switch (insFmt) - { - case IF_RRW_SHF: - case IF_RWR_RRD_SHF: - // ins reg, cns - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - break; - - case IF_MRW_SHF: - case IF_SRW_SHF: - case IF_ARW_SHF: - // ins [mem], cns - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - break; - - default: - // unhandled instruction insFmt combination - perfScoreUnhandledInstruction(id, &result); - break; + insThroughput = PERFSCORE_THROUGHPUT_6C; + insLatency = PERFSCORE_LATENCY_26C; } break; } @@ -20791,77 +20538,17 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_shld: case INS_shrd: { - result.insLatency += PERFSCORE_LATENCY_3C; + insLatency = PERFSCORE_LATENCY_3C; + if (insFmt == IF_RRW_RRD_CNS) { // ins reg, reg, cns - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; 
} else { assert(memAccessKind == PERFSCORE_MEMORY_WRITE); // _SHF form never emitted - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - } - break; - } - - case INS_bt: - { - result.insLatency += PERFSCORE_LATENCY_1C; - if ((insFmt == IF_RRD_RRD) || (insFmt == IF_RRD_CNS)) - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - } - break; - } - - case INS_seto: - case INS_setno: - case INS_setb: - case INS_setae: - case INS_sete: - case INS_setne: - case INS_setbe: - case INS_seta: - case INS_sets: - case INS_setns: - case INS_setp: - case INS_setnp: - case INS_setl: - case INS_setge: - case INS_setle: - case INS_setg: -#ifdef TARGET_AMD64 - case INS_seto_apx: - case INS_setno_apx: - case INS_setb_apx: - case INS_setae_apx: - case INS_sete_apx: - case INS_setne_apx: - case INS_setbe_apx: - case INS_seta_apx: - case INS_sets_apx: - case INS_setns_apx: - case INS_setp_apx: - case INS_setnp_apx: - case INS_setl_apx: - case INS_setge_apx: - case INS_setle_apx: - case INS_setg_apx: -#endif - { - result.insLatency += PERFSCORE_LATENCY_1C; - if (insFmt == IF_RRD) - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_2C; } break; } @@ -20871,14 +20558,14 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins if (emitInstHasNoCode(id)) { // a removed jmp to the next instruction - result.insThroughput = PERFSCORE_THROUGHPUT_ZERO; - result.insLatency = PERFSCORE_LATENCY_ZERO; + insThroughput = PERFSCORE_THROUGHPUT_ZERO; + insLatency = PERFSCORE_LATENCY_ZERO; } else { // branch to a constant address - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency = PERFSCORE_LATENCY_BRANCH_DIRECT; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_BRANCH_DIRECT; } break; } @@ -20886,30 +20573,29 @@ emitter::insExecutionCharacteristics 
emitter::getInsExecutionCharacteristics(ins case INS_call: { // uops.info - result.insLatency = PERFSCORE_LATENCY_ZERO; + insLatency = PERFSCORE_LATENCY_ZERO; + switch (insFmt) { case IF_LABEL: - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; break; case IF_METHOD: - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; break; case IF_METHPTR: - result.insThroughput = PERFSCORE_THROUGHPUT_3C; + insThroughput = PERFSCORE_THROUGHPUT_3C; break; case IF_SRD: case IF_ARD: case IF_MRD: - result.insThroughput = PERFSCORE_THROUGHPUT_3C; + insThroughput = PERFSCORE_THROUGHPUT_3C; break; default: - // unhandled instruction, insFmt combination - perfScoreUnhandledInstruction(id, &result); break; } break; @@ -20917,30 +20603,24 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_ret: { + insLatency = PERFSCORE_LATENCY_ZERO; + if (insFmt == IF_CNS) { - result.insThroughput = PERFSCORE_THROUGHPUT_2C; + insThroughput = PERFSCORE_THROUGHPUT_2C; } else { assert(insFmt == IF_NONE); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; } break; } case INS_xchg: { - // uops.info - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (memFmt == IF_NONE) - { - result.insLatency = PERFSCORE_LATENCY_1C; - } - else - { - result.insLatency = PERFSCORE_LATENCY_23C; - } + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (memFmt == IF_NONE) ? 
PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_23C; break; } @@ -20948,11 +20628,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_fld: case INS_fstp: { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - if (memAccessKind == PERFSCORE_MEMORY_NONE) - { - result.insLatency = PERFSCORE_LATENCY_1C; - } + insThroughput = PERFSCORE_THROUGHPUT_2X; + insLatency = PERFSCORE_LATENCY_1C; break; } #endif // TARGET_X86 @@ -20960,125 +20637,38 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_movd32: case INS_movd64: case INS_movq: + case INS_vmovw: { - if (memAccessKind == PERFSCORE_MEMORY_NONE) - { - if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2())) - { - // movq xmm, xmm - result.insThroughput = PERFSCORE_THROUGHPUT_3X; - result.insLatency = PERFSCORE_LATENCY_1C; - } - else - { - // movd r32/64, xmm or xmm, r32/64 - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_3C; - } - } - else if (memAccessKind == PERFSCORE_MEMORY_READ) - { - // ins xmm, m32/64 - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += PERFSCORE_LATENCY_2C; - } - else - { - // ins m32/64, xmm - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_2C; - } - break; - } + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_3C; - case INS_movdqa32: - case INS_vmovdqa64: - case INS_movdqu32: - case INS_vmovdqu8: - case INS_vmovdqu16: - case INS_vmovdqu64: - case INS_vmovd_simd: - case INS_vmovw_simd: - case INS_movaps: - case INS_movups: - case INS_movapd: - case INS_movupd: - { - if (memAccessKind == PERFSCORE_MEMORY_NONE) - { - // ins reg, reg - result.insThroughput = PERFSCORE_THROUGHPUT_4X; - result.insLatency = PERFSCORE_LATENCY_ZERO; - } - else if (memAccessKind == PERFSCORE_MEMORY_READ) - { - // ins reg, mem - result.insThroughput = 
PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; - } - else - { - // ins mem, reg - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_2C; - } - break; - } - - case INS_movhps: - case INS_movhpd: - case INS_movlps: - case INS_movlpd: - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (memAccessKind == PERFSCORE_MEMORY_READ) - { - result.insLatency += PERFSCORE_LATENCY_3C; - } - else - { - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insLatency += PERFSCORE_LATENCY_2C; - } - break; - } - - case INS_movntdqa: - { - assert(memAccessKind == PERFSCORE_MEMORY_READ); - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; - break; - } - - case INS_movss: - case INS_movsd_simd: - case INS_movddup: - { - if (memAccessKind == PERFSCORE_MEMORY_NONE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_1C; - } - else if (memAccessKind == PERFSCORE_MEMORY_READ) + if (memAccessKind == PERFSCORE_MEMORY_READ) { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; + // The reads have twice the throughput of the register to register variants + insThroughput = PERFSCORE_THROUGHPUT_2X; } - else + else if (isFloatReg(id->idReg1()) && isFloatReg(id->idReg2())) { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_2C; + // movq xmm, xmm + insThroughput = PERFSCORE_THROUGHPUT_3X; + insLatency = PERFSCORE_LATENCY_1C; } break; } - case INS_lddqu: + case INS_movss: + case INS_movsd_simd: + case INS_movddup: + case INS_vmovsh: { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_32BYTE ? 
PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_1C; + + if (memAccessKind == PERFSCORE_MEMORY_READ) + { + // The reads have twice the throughput of the register to register variants + insThroughput = PERFSCORE_THROUGHPUT_2X; + } break; } @@ -21101,8 +20691,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vpmovuswb: case INS_vpmovwb: { - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency += (opSize == EA_16BYTE) ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_4C; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_2C; break; } @@ -21117,13 +20707,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { if (opSize == EA_64BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency += PERFSCORE_LATENCY_8C; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_8C; } else { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_4C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_4C; } break; } @@ -21132,20 +20722,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { if (opSize == EA_16BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_6C; - result.insLatency += PERFSCORE_LATENCY_12C; + insThroughput = PERFSCORE_THROUGHPUT_6C; + insLatency = PERFSCORE_LATENCY_12C; } else if (opSize == EA_32BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_10C; - result.insLatency += PERFSCORE_LATENCY_16C; + insThroughput = PERFSCORE_THROUGHPUT_10C; + insLatency = PERFSCORE_LATENCY_16C; } else { assert(opSize == EA_64BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_19C; - result.insLatency += PERFSCORE_LATENCY_26C; + insThroughput = PERFSCORE_THROUGHPUT_19C; + insLatency = PERFSCORE_LATENCY_26C; } break; } @@ 
-21154,20 +20744,20 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { if (opSize == EA_16BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency += PERFSCORE_LATENCY_4C; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_4C; } else if (opSize == EA_32BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_6C; - result.insLatency += PERFSCORE_LATENCY_12C; + insThroughput = PERFSCORE_THROUGHPUT_6C; + insLatency = PERFSCORE_LATENCY_12C; } else { assert(opSize == EA_64BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_10C; - result.insLatency += PERFSCORE_LATENCY_16C; + insThroughput = PERFSCORE_THROUGHPUT_10C; + insLatency = PERFSCORE_LATENCY_16C; } break; } @@ -21179,56 +20769,8 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtss2usi32: case INS_vcvtss2usi64: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C; - break; - } - - case INS_paddb: - case INS_psubb: - case INS_paddw: - case INS_psubw: - case INS_paddd: - case INS_psubd: - case INS_paddq: - case INS_psubq: - case INS_paddsb: - case INS_psubsb: - case INS_paddsw: - case INS_psubsw: - case INS_paddusb: - case INS_psubusb: - case INS_paddusw: - case INS_psubusw: - case INS_pandd: - case INS_vpandq: - case INS_pandnd: - case INS_vpandnq: - case INS_pord: - case INS_vporq: - case INS_pxord: - case INS_vpxorq: - case INS_andpd: - case INS_andps: - case INS_andnpd: - case INS_andnps: - case INS_orpd: - case INS_orps: - case INS_xorpd: - case INS_xorps: - case INS_blendps: - case INS_blendpd: - case INS_vpblendd: - { - result.insLatency += PERFSCORE_LATENCY_1C; - if (memAccessKind == PERFSCORE_MEMORY_NONE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_3X; - } - else - { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - } + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize == 
EA_8BYTE) ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_7C; break; } @@ -21244,13 +20786,13 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { if (insFmt == IF_RWR_CNS) { - result.insLatency = PERFSCORE_LATENCY_1C; - result.insThroughput = PERFSCORE_THROUGHPUT_2X; + insLatency = PERFSCORE_LATENCY_1C; + insThroughput = PERFSCORE_THROUGHPUT_2X; } else { - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_2C; - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_2C; + insThroughput = PERFSCORE_THROUGHPUT_1C; } break; } @@ -21260,46 +20802,37 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vpblendmd: case INS_vpblendmq: { - if (opSize == EA_64BYTE) + if (opSize >= EA_64BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; + insThroughput = PERFSCORE_THROUGHPUT_2X; } else { - result.insThroughput = PERFSCORE_THROUGHPUT_3X; + insThroughput = PERFSCORE_THROUGHPUT_3X; } - result.insLatency += PERFSCORE_LATENCY_1C; + insLatency = PERFSCORE_LATENCY_1C; break; } case INS_vpblendmb: case INS_vpblendmw: { - if (opSize == EA_64BYTE) + if (opSize >= EA_64BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; + insThroughput = PERFSCORE_THROUGHPUT_2X; } else { - result.insThroughput = PERFSCORE_THROUGHPUT_3X; + insThroughput = PERFSCORE_THROUGHPUT_3X; } - result.insLatency += PERFSCORE_LATENCY_3C; + insLatency = PERFSCORE_LATENCY_3C; break; } case INS_bswap: { - if (opSize == EA_8BYTE) - { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = PERFSCORE_LATENCY_2C; - } - else - { - assert(opSize == EA_4BYTE); - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency = PERFSCORE_LATENCY_1C; - } + insThroughput = PERFSCORE_THROUGHPUT_2X; + insLatency = (opSize == EA_8BYTE) ? 
PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; break; } @@ -21307,14 +20840,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_movmskpd: case INS_movmskps: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - if (opSize == EA_32BYTE) + insThroughput = PERFSCORE_THROUGHPUT_1C; + + if (opSize >= EA_32BYTE) { - result.insLatency += ins == INS_pmovmskb ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_5C; + insLatency = (ins == INS_pmovmskb) ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_5C; } else { - result.insLatency += PERFSCORE_LATENCY_3C; + insLatency = PERFSCORE_LATENCY_3C; } break; } @@ -21332,15 +20866,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_pmovzxwq: case INS_pmovzxdq: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C; break; } case INS_ptest: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_6C : PERFSCORE_LATENCY_4C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_6C : PERFSCORE_LATENCY_4C; break; } @@ -21362,27 +20896,29 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vcvtudq2pd: case INS_vcvtuqq2ps: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_7C : PERFSCORE_LATENCY_5C; break; } case INS_vtestps: case INS_vtestpd: { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_32BYTE ? 
PERFSCORE_LATENCY_5C : PERFSCORE_LATENCY_3C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_5C : PERFSCORE_LATENCY_3C; break; } case INS_vpbroadcastb: - case INS_vpbroadcastb_gpr: case INS_vpbroadcastw: - case INS_vpbroadcastw_gpr: + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C; + break; + } + case INS_vpbroadcastd: - case INS_vpbroadcastd_gpr: case INS_vpbroadcastq: - case INS_vpbroadcastq_gpr: case INS_vbroadcasti32x4: case INS_vbroadcastf32x4: case INS_vbroadcastf64x2: @@ -21396,19 +20932,15 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins case INS_vbroadcastss: case INS_vbroadcastsd: { + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_1C; + if (memAccessKind == PERFSCORE_MEMORY_NONE) { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency = opSize == EA_16BYTE ? PERFSCORE_LATENCY_1C : PERFSCORE_LATENCY_3C; + insThroughput = PERFSCORE_THROUGHPUT_1C; } else { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_16BYTE ? 
PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_3C; - if (ins == INS_vpbroadcastb || ins == INS_vpbroadcastw) - { - result.insLatency += PERFSCORE_LATENCY_1C; - } + insThroughput = PERFSCORE_THROUGHPUT_2X; } break; } @@ -21420,104 +20952,432 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins { if (memAccessKind == PERFSCORE_MEMORY_NONE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2C; - result.insLatency = PERFSCORE_LATENCY_4C; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_4C; } else { - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_3C; + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_3C; } break; } - case INS_vmaskmovpd: - case INS_vmaskmovps: - case INS_vpmaskmovd: - case INS_vpmaskmovq: + case INS_vpgatherdd: + case INS_vgatherdps: + case INS_vpgatherdd_msk: { - if (memAccessKind == PERFSCORE_MEMORY_READ) + if (opSize == EA_16BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_4C : PERFSCORE_LATENCY_3C; + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_11C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_13C; } else { - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += PERFSCORE_LATENCY_12C; + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_17C; } break; } - case INS_vpgatherdd: - case INS_vgatherdps: - { - result.insThroughput = PERFSCORE_THROUGHPUT_4C; - result.insLatency += opSize == EA_32BYTE ? 
PERFSCORE_LATENCY_13C : PERFSCORE_LATENCY_11C; - break; - } - case INS_vpgatherdq: case INS_vpgatherqd: case INS_vpgatherqq: case INS_vgatherdpd: case INS_vgatherqps: case INS_vgatherqpd: + case INS_vgatherdpd_msk: + case INS_vgatherdps_msk: + case INS_vgatherqpd_msk: + case INS_vgatherqps_msk: + case INS_vpgatherdq_msk: + case INS_vpgatherqd_msk: + case INS_vpgatherqq_msk: { - result.insThroughput = PERFSCORE_THROUGHPUT_4C; - result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_11C : PERFSCORE_LATENCY_9C; + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_9C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_11C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_13C; + } break; } - case INS_movbe: -#ifdef TARGET_AMD64 - case INS_movbe_apx: -#endif + case INS_vpscatterdd_msk: + case INS_vscatterdps_msk: { - if (memAccessKind == PERFSCORE_MEMORY_READ) + if (opSize == EA_16BYTE) { - result.insThroughput = PERFSCORE_THROUGHPUT_2X; - result.insLatency += opSize == EA_8BYTE ? PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_7C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_9C; } else { - assert(memAccessKind == PERFSCORE_MEMORY_WRITE); - result.insThroughput = PERFSCORE_THROUGHPUT_1C; - result.insLatency += opSize == EA_8BYTE ? 
PERFSCORE_LATENCY_2C : PERFSCORE_LATENCY_1C; + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_16C; + insLatency = PERFSCORE_LATENCY_13C; } break; } - case INS_vbmacor16x16x16: - case INS_vbmacxor16x16x16: - case INS_vbitrev: + case INS_vpscatterdq_msk: + case INS_vpscatterqd_msk: + case INS_vpscatterqq_msk: + case INS_vscatterdpd_msk: + case INS_vscatterqpd_msk: + case INS_vscatterqps_msk: { - result.insLatency = PERFSCORE_LATENCY_1C; - result.insThroughput = PERFSCORE_THROUGHPUT_1C; + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_5C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_7C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_9C; + } break; } - default: + case INS_vpshldd: + case INS_vpshldq: + case INS_vpshldvd: + case INS_vpshldvq: + case INS_vpshldvw: + case INS_vpshldw: + case INS_vpshrdd: + case INS_vpshrdq: + case INS_vpshrdvd: + case INS_vpshrdvq: + case INS_vpshrdvw: + case INS_vpshrdw: { - assert((unsigned)ins < ArrLen(insThroughputInfos)); - float insThroughput = insThroughputInfos[ins]; + insThroughput = (opSize >= EA_64BYTE) ? PERFSCORE_THROUGHPUT_1C : PERFSCORE_THROUGHPUT_2X; + insLatency = PERFSCORE_LATENCY_1C; + break; + } - assert((unsigned)ins < ArrLen(insLatencyInfos)); - float insLatency = insLatencyInfos[ins]; + case INS_vcvtdq2ph: + case INS_vcvtudq2ph: + { + if (opSize >= EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_9C; + } + else + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_7C; + } + break; + } + + case INS_vcvtne2ps2bf16: + case INS_vcvtneps2bf16: + { + insThroughput = (opSize >= EA_64BYTE) ? 
PERFSCORE_THROUGHPUT_2C : PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_8C; + break; + } + + case INS_vcvtph2psx: + case INS_vcvtps2phx: + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = (opSize >= EA_32BYTE) ? PERFSCORE_LATENCY_8C : PERFSCORE_LATENCY_6C; + break; + } + + case INS_vcvtph2qq: + case INS_vcvtph2uqq: + case INS_vcvttph2qq: + case INS_vcvttph2uqq: + { + insThroughput = (opSize >= EA_64BYTE) ? PERFSCORE_THROUGHPUT_2C : PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_10C; + break; + } + + case INS_vcvtqq2ph: + case INS_vcvtuqq2ph: + { + if (opSize >= EA_64BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_10C; + } + else + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_8C; + } + break; + } + + case INS_divpd: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_13C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_13C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_16C; + insLatency = PERFSCORE_LATENCY_23C; + } + break; + } + + case INS_divps: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_3C; + insLatency = PERFSCORE_LATENCY_11C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_5C; + insLatency = PERFSCORE_LATENCY_11C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_10C; + insLatency = PERFSCORE_LATENCY_18C; + } + break; + } + + case INS_vdivph: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_31C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_16C; + insLatency = PERFSCORE_LATENCY_31C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_32C; + insLatency = 
PERFSCORE_LATENCY_41C; + } + break; + } + + case INS_vrcpph: + { + if (opSize >= EA_64BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_6C; + } + else + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_4C; + } + break; + } + + case INS_vrsqrtph: + { + if (opSize >= EA_64BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_2C; + insLatency = PERFSCORE_LATENCY_7C; + } + else + { + insThroughput = PERFSCORE_THROUGHPUT_1C; + insLatency = PERFSCORE_LATENCY_5C; + } + break; + } + + case INS_sqrtpd: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_4C; + insLatency = PERFSCORE_LATENCY_16C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_8C; + insLatency = PERFSCORE_LATENCY_16C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_16C; + insLatency = PERFSCORE_LATENCY_28C; + } + break; + } + + case INS_sqrtps: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_3C; + insLatency = PERFSCORE_LATENCY_12C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_6C; + insLatency = PERFSCORE_LATENCY_12C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_12C; + insLatency = PERFSCORE_LATENCY_20C; + } + break; + } + + case INS_vsqrtph: + { + if (opSize == EA_16BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_9C; + insLatency = PERFSCORE_LATENCY_33C; + } + else if (opSize == EA_32BYTE) + { + insThroughput = PERFSCORE_THROUGHPUT_18C; + insLatency = PERFSCORE_LATENCY_33C; + } + else + { + assert(opSize == EA_64BYTE); + insThroughput = PERFSCORE_THROUGHPUT_36C; + insLatency = PERFSCORE_LATENCY_45C; + } + break; + } + + case INS_vp2intersectd: + case INS_vp2intersectq: + { + insThroughput = PERFSCORE_THROUGHPUT_1C; - if ((insLatency == PERFSCORE_LATENCY_ILLEGAL) || (insThroughput == PERFSCORE_THROUGHPUT_ILLEGAL)) + if (opSize == EA_16BYTE) + { + insLatency = 
PERFSCORE_LATENCY_3C; + } + else if (opSize == EA_32BYTE) { - // unhandled instruction insFmt combination - perfScoreUnhandledInstruction(id, &result); + insLatency = PERFSCORE_LATENCY_4C; + } + else + { + assert(opSize == EA_64BYTE); + insLatency = PERFSCORE_LATENCY_6C; } + break; + } + + default: + { + assert((unsigned)ins < ArrLen(insThroughputInfos)); + assert((unsigned)ins < ArrLen(insLatencyInfos)); - result.insLatency += insLatency; - result.insThroughput = insThroughput; + insThroughput = insThroughputInfos[ins]; + insLatency = insLatencyInfos[ins]; break; } } + insExecutionCharacteristics result; + + assert(memThroughput != PERFSCORE_THROUGHPUT_ILLEGAL); + assert(memLatency != PERFSCORE_LATENCY_ILLEGAL); + + if ((insLatency == PERFSCORE_LATENCY_ILLEGAL) || (insThroughput == PERFSCORE_THROUGHPUT_ILLEGAL)) + { + // unhandled instruction insFmt combination + perfScoreUnhandledInstruction(id, &result); + result.insMemoryAccessKind = memAccessKind; + return result; + } + else + { + if (memAccessKind != PERFSCORE_MEMORY_NONE) + { + if (IsSimdInstruction(ins)) + { + // SIMD and floating-point indirections are a bit more expensive + + if (opSize >= EA_64BYTE) + { + memLatency += PERFSCORE_LATENCY_2C; + } + else if (opSize >= EA_32BYTE) + { + memLatency += PERFSCORE_LATENCY_1C; + } + } + else if (insLatency < memLatency) + { + // Most general-purpose instructions have their costs amortized by the memory access + insLatency = PERFSCORE_LATENCY_ZERO; + } + } + + // The throughput is a reciprocal, meaning 2X is 0.5 and 2C is 2.0, so we want + // to pick the higher value as it is the bigger limiter on the throughput. + // + // Latencies we just want to add together as we require the time to read/write + // the memory and to perform the operation. 
+ + result.insThroughput = std::max(insThroughput, memThroughput); + result.insLatency = insLatency + memLatency; + } + result.insMemoryAccessKind = memAccessKind; + return result; } diff --git a/src/coreclr/jit/instrsxarch.h b/src/coreclr/jit/instrsxarch.h index ce5f1f63320b67..622ae8cedee91f 100644 --- a/src/coreclr/jit/instrsxarch.h +++ b/src/coreclr/jit/instrsxarch.h @@ -54,38 +54,38 @@ // id nm um mr mi rm a4 rr lat tp tt flags INST5(invalid, "INVALID", IUM_RD, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) -INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) -INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) +INST5(push, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, ZERO, 1C, INS_TT_NONE, Encoding_REX2) +INST5(pop, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, ZERO, 2X, INS_TT_NONE, Encoding_REX2) // Does not affect the stack tracking in the emitter -INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) -INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) +INST5(push_hide, "push", IUM_RD, 0x0030FE, 0x000068, BAD_CODE, BAD_CODE, 0x000050, ZERO, 1C, INS_TT_NONE, Encoding_REX2) +INST5(pop_hide, "pop", IUM_WR, 0x00008E, BAD_CODE, BAD_CODE, BAD_CODE, 0x000058, ZERO, 2X, INS_TT_NONE, Encoding_REX2) -INST5(push2, "push2", IUM_RD, 0x0030FF, BAD_CODE, 0x0030FF, BAD_CODE, 0x0030FF, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_Flags_Has_NDD) -INST5(pop2, "pop2", IUM_WR, 0x00008F, BAD_CODE, 0x00008F, BAD_CODE, 0x00008F, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_Flags_Has_NDD) +INST5(push2, "push2", IUM_RD, 0x0030FF, BAD_CODE, 0x0030FF, BAD_CODE, 0x0030FF, ZERO, 1C, INS_TT_NONE, INS_Flags_Has_NDD) +INST5(pop2, 
"pop2", IUM_WR, 0x00008F, BAD_CODE, 0x00008F, BAD_CODE, 0x00008F, ZERO, 2X, INS_TT_NONE, INS_Flags_Has_NDD) -INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) -INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(inc, "inc", IUM_RW, 0x0000FE, BAD_CODE, 0x0000FE, BAD_CODE, 0x000040, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(inc_l, "inc", IUM_RW, 0x0000FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C0FE, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) +INST5(dec, "dec", IUM_RW, 0x0008FE, BAD_CODE, 0x0008FE, BAD_CODE, 0x000048, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST5(dec_l, "dec", IUM_RW, 0x0008FE, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C8FE, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Encoding_REX2 | INS_Flags_Has_NF) // Multi-byte opcodes without modrm are represented in mixed endian fashion. 
// See comment around quarter way through this file for more information. INST5(bswap, "bswap", IUM_RW, 0x0F00C8, BAD_CODE, BAD_CODE, BAD_CODE, 0x00C80F, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) // id nm um mr mi rm a4 lat tp tt flags -INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, ILLEGAL, ILLEGAL, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, ILLEGAL, ILLEGAL, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(add, "add", IUM_RW, 0x000000, 0x000080, 0x000002, 0x000004, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 
| INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(or, "or", IUM_RW, 0x000008, 0x000880, 0x00000A, 0x00000C, 1C, 4X, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(adc, "adc", IUM_RW, 0x000010, 0x001080, 0x000012, 0x000014, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sbb, "sbb", IUM_RW, 0x000018, 0x001880, 0x00001A, 0x00001C, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | Reads_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(and, "and", IUM_RW, 0x000020, 0x002080, 0x000022, 0x000024, 1C, 4X, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(sub, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) // Does not affect the stack tracking in the emitter -INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(sub_hide, "sub", IUM_RW, 0x000028, 0x002880, 0x00002A, 0x00002C, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, ILLEGAL, ILLEGAL, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | 
Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, ILLEGAL, ILLEGAL, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) -INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(xor, "xor", IUM_RW, 0x000030, 0x003080, 0x000032, 0x000034, 1C, 4X, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST4(cmp, "cmp", IUM_RD, 0x000038, 0x003880, 0x00003A, 0x00003C, ZERO, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(test, "test", IUM_RD, 0x000084, 0x0000F6, 0x000084, 0x0000A8, ZERO, 4X, INS_TT_NONE, Resets_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Resets_CF | INS_FLAGS_Has_Wbit | Encoding_REX2) +INST4(mov, "mov", IUM_WR, 0x000088, 0x0000C6, 0x00008A, 0x0000B0, ZERO, 4X, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, 0x00008D, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Encoding_REX2) @@ -93,33 +93,33 @@ INST4(lea, "lea", IUM_WR, BAD_CODE, BAD_CODE, // Note that emitter has only partial support for BT. It can only emit the reg,reg form // and the registers need to be reversed to get the correct encoding. 
-INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) +INST3(bt, "bt", IUM_RD, 0x0F00A3, BAD_CODE, 0x0F00A3, 1C, 2X, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | Encoding_REX2) INST3(bsr, "bsr", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BD, 3C, 1C, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF | Encoding_REX2) INST3(bsf, "bsf", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BC, 3C, 1C, INS_TT_NONE, Undefined_OF | Undefined_SF | Writes_ZF | Undefined_AF | Undefined_PF | Undefined_CF | Encoding_REX2) -INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) +INST3(movsx, "movsx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00BE, 1C, 4X, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) #ifdef TARGET_AMD64 -INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x000063, ILLEGAL, ILLEGAL, INS_TT_NONE, REX_W1 | Encoding_REX2) +INST3(movsxd, "movsxd", IUM_WR, BAD_CODE, BAD_CODE, 0x000063, ZERO, 4X, INS_TT_NONE, REX_W1 | Encoding_REX2) #endif -INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) - -INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, ILLEGAL, ILLEGAL, 
INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(movzx, "movzx", IUM_WR, BAD_CODE, BAD_CODE, 0x0F00B6, ZERO, 4X, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) + +INST3(cmovo, "cmovo", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0040, 1C, 2X, INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovno, "cmovno", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0041, 1C, 2X, 
INS_TT_NONE, Reads_OF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovb, "cmovb", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0042, 1C, 2X, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovae, "cmovae", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0043, 1C, 2X, INS_TT_NONE, Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmove, "cmove", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0044, 1C, 2X, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovne, "cmovne", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0045, 1C, 2X, INS_TT_NONE, Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovbe, "cmovbe", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0046, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmova, "cmova", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0047, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovs, "cmovs", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0048, 1C, 2X, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovns, "cmovns", IUM_WR, BAD_CODE, BAD_CODE, 0x0F0049, 1C, 2X, INS_TT_NONE, Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovp, "cmovp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004A, 1C, 2X, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovnp, "cmovnp", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004B, 1C, 2X, INS_TT_NONE, Reads_PF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovl, "cmovl", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004C, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovge, "cmovge", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004D, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovle, "cmovle", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004E, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(cmovg, "cmovg", IUM_WR, BAD_CODE, BAD_CODE, 0x0F004F, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2 | INS_Flags_Has_NDD) INST3(xchg, "xchg", IUM_RW, 0x000086, 
BAD_CODE, 0x000086, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_Has_Wbit | Encoding_REX2) INST3(imul, "imul", IUM_RW, 0x0F00AC, BAD_CODE, 0x0F00AF, 3C, 1C, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Sbit | INS_Flags_Has_NDD | INS_Flags_Has_NF | Encoding_REX2) @@ -224,12 +224,12 @@ INST3(addsd, "vaddsd", IUM_WR, BAD_CODE, BAD_CODE, INST3(addss, "vaddss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add scalar singles INST3(addsubpd, "vaddsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed doubles INST3(addsubps, "vaddsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xD0), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add/Subtract packed singles -INST3(andnpd, "vandnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles -INST3(andnps, "vandnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles -INST3(andpd, "vandpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles -INST3(andps, "vandps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles -INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), 
ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values -INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values +INST3(andnpd, "vandnpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x55), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed doubles +INST3(andnps, "vandnps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x55), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // And-Not packed singles +INST3(andpd, "vandpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x54), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed doubles +INST3(andps, "vandps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x54), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // AND packed singles +INST3(blendpd, "vblendpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0D), 1C, 3X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Double Precision Floating-Point Values +INST3(blendps, "vblendps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0C), 1C, 3X, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed Single Precision Floating-Point Values INST3(blendvpd, "blendvpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Doubles INST3(blendvps, "blendvps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x14), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Singles INST3(cmppd, "vcmppd", IUM_WR, BAD_CODE, 
BAD_CODE, PCKDBL(0xC2), 4C, 2X, INS_TT_FULL, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // compare packed doubles @@ -260,8 +260,8 @@ INST3(cvttsd2si32, "vcvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, INST3(cvttsd2si64, "vcvttsd2si", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x2C), 7C, 1C, INS_TT_TUPLE1_FIXED, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar double to signed DWORDs INST3(cvttss2si32, "vcvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD INST3(cvttss2si64, "vcvttss2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x2C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_FIXED, Input_32Bit | REX_W1 | Encoding_VEX | Encoding_EVEX) // cvt with trunc scalar single to DWORD -INST3(divpd, "vdivpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), 13C, 4C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles -INST3(divps, "vdivps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), 11C, 3C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles +INST3(divpd, "vdivpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5E), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed doubles +INST3(divps, "vdivps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5E), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide packed singles INST3(divsd, "vdivsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5E), 13C, 4C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar 
doubles INST3(divss, "vdivss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5E), 11C, 3C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Divide scalar singles INST3(dppd, "vdppd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x41), 9C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed dot product of two double vector regs @@ -272,7 +272,7 @@ INST3(haddps, "vhaddps", IUM_WR, BAD_CODE, BAD_CODE, INST3(hsubpd, "vhsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed doubles INST3(hsubps, "vhsubps", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x7D), 6C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Horizontal subtract packed floats INST3(insertps, "vinsertps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x21), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert packed single precision float value -INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer +INST3(lddqu, "vlddqu", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0xF0), ZERO, ZERO, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Load Unaligned integer INST3(lfence, "lfence", IUM_RD, 0x000FE8AE, BAD_CODE, BAD_CODE, ZERO, 4C, INS_TT_NONE, REX_WIG) INST3(maskmovdqu, "vmaskmovdqu", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF7), 400C, 6C, INS_TT_NONE, REX_WIG | Encoding_VEX) INST3(maxpd, "vmaxpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5F), 4C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Maximum packed doubles @@ -284,23 +284,23 @@ INST3(minpd, "vminpd", IUM_WR, BAD_CODE, BAD_CODE, INST3(minps, "vminps", IUM_WR, BAD_CODE, 
BAD_CODE, PCKFLT(0x5D), 4C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum packed singles INST3(minsd, "vminsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x5D), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar double INST3(minss, "vminss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x5D), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Return Minimum scalar single -INST3(movapd, "vmovapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movaps, "vmovaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movapd, "vmovapd", IUM_WR, PCKDBL(0x29), BAD_CODE, PCKDBL(0x28), ZERO, 4X, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movaps, "vmovaps", IUM_WR, PCKFLT(0x29), BAD_CODE, PCKFLT(0x28), ZERO, 4X, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) INST3(movd32, "vmovd", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move DWORD between xmm regs <-> memory/r32 regs INST3(movd64, "vmovq", IUM_WR, PCKDBL(0x7E), BAD_CODE, PCKDBL(0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_VEX | Encoding_EVEX | Encoding_REX2) // Move QWORD between xmm regs <-> memory/r64 regs INST3(movddup, "vmovddup", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_MOVDDUP, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate Double FP Values -INST3(movdqa32, "vmovdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ILLEGAL, 
ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) -INST3(movdqu32, "vmovdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) +INST3(movdqa32, "vmovdqa", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) +INST3(movdqu32, "vmovdqu", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | Encoding_REX2 | INS_FLAGS_HasPseudoName) INST3(movhlps, "vmovhlps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x12), 1C, 1C, INS_TT_NONE, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movhpd, "vmovhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movhps, "vmovhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), ILLEGAL, ILLEGAL, INS_TT_TUPLE2, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhpd, "vmovhpd", IUM_WR, PCKDBL(0x17), BAD_CODE, PCKDBL(0x16), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movhps, "vmovhps", IUM_WR, PCKFLT(0x17), BAD_CODE, PCKFLT(0x16), 1C, 1C, INS_TT_TUPLE2, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movlhps, "vmovlhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x16), 1C, 1C, INS_TT_NONE, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) -INST3(movlpd, "vmovlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | 
INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movlps, "vmovlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), ILLEGAL, ILLEGAL, INS_TT_TUPLE2, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movlpd, "vmovlpd", IUM_WR, PCKDBL(0x13), BAD_CODE, PCKDBL(0x12), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) +INST3(movlps, "vmovlps", IUM_WR, PCKFLT(0x13), BAD_CODE, PCKFLT(0x12), 1C, 1C, INS_TT_TUPLE2, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) INST3(movmskpd, "vmovmskpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) // Extract 2-bit sign mask from xmm and store in reg. The upper bits of r32 or r64 are filled with zeros. INST3(movmskps, "vmovmskps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x50), ILLEGAL, ILLEGAL, INS_TT_NONE, REX_WIG | Encoding_VEX) INST3(movntdq, "vmovntdq", IUM_WR, PCKDBL(0xE7), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint +INST3(movntdqa, "vmovntdqa", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2A), ZERO, ZERO, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Load Double Quadword Non-Temporal Aligned Hint INST3(movnti32, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_REX2) INST3(movnti64, "movnti", IUM_WR, PCKFLT(0xC3), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_REX2) INST3(movntpd, "vmovntpd", IUM_WR, PCKDBL(0x2B), BAD_CODE, BAD_CODE, 400C, 1C, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) @@ -310,15 +310,15 @@ INST3(movsd_simd, "vmovsd", IUM_WR, 
SSEDBL(0x11), BAD_CODE, INST3(movshdup, "vmovshdup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x16), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate odd-indexed Single FP Values INST3(movsldup, "vmovsldup", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x12), 1C, 1C, INS_TT_FULL_MEM, KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Replicate even-indexed Single FP Values INST3(movss, "vmovss", IUM_WR, SSEFLT(0x11), BAD_CODE, SSEFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) -INST3(movupd, "vmovupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) -INST3(movups, "vmovups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movupd, "vmovupd", IUM_WR, PCKDBL(0x11), BAD_CODE, PCKDBL(0x10), ZERO, 4X, INS_TT_FULL_MEM, REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) +INST3(movups, "vmovups", IUM_WR, PCKFLT(0x11), BAD_CODE, PCKFLT(0x10), ZERO, 4X, INS_TT_FULL_MEM, REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) INST3(mpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x42), 4C, 2C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference INST3(mulpd, "vmulpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x59), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed doubles INST3(mulps, "vmulps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x59), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply packed singles INST3(mulsd, "vmulsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | 
REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar doubles INST3(mulss, "vmulss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Multiply scalar single -INST3(orpd, "vorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles -INST3(orps, "vorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles +INST3(orpd, "vorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x56), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed doubles +INST3(orps, "vorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x56), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Or packed singles INST3(pabsb, "vpabsb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1C), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of bytes INST3(pabsd, "vpabsd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1E), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 32-bit integers INST3(pabsw, "vpabsw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1D), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX) // Packed absolute value of 16-bit integers @@ -326,17 +326,17 @@ INST3(packssdw, "vpackssdw", IUM_WR, BAD_CODE, BAD_CODE, INST3(packsswb, "vpacksswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x63), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | 
Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to byte with saturation INST3(packusdw, "vpackusdw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x2B), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) int to unsigned short with saturation INST3(packuswb, "vpackuswb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x67), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Pack (narrow) short to unsigned byte with saturation -INST3(paddb, "vpaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers -INST3(paddd, "vpaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers -INST3(paddq, "vpaddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers -INST3(paddsb, "vpaddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results -INST3(paddsw, "vpaddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results -INST3(paddusb, "vpaddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | 
Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned byte integers and saturate the results -INST3(paddusw, "vpaddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results -INST3(paddw, "vpaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers +INST3(paddb, "vpaddb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFC), 1C, 3X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed byte integers +INST3(paddd, "vpaddd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFE), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed double-word (32-bit) integers +INST3(paddq, "vpaddq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD4), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed quad-word (64-bit) integers +INST3(paddsb, "vpaddsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEC), 1C, 3X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed byte integers and saturate the results +INST3(paddsw, "vpaddsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xED), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed signed word integers and saturate the results +INST3(paddusb, "vpaddusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDC), 1C, 3X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned 
byte integers and saturate the results +INST3(paddusw, "vpaddusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDD), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed unsigned word integers and saturate the results +INST3(paddw, "vpaddw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFD), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Add packed word (16-bit) integers INST3(palignr, "vpalignr", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x0F), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Align Right -INST3(pandd, "vpand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND of two xmm regs -INST3(pandnd, "vpandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND NOT of two xmm regs +INST3(pandd, "vpand", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND of two xmm regs +INST3(pandnd, "vpandn", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise AND NOT of two xmm regs INST3(pavgb, "vpavgb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE0), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed 
byte integers INST3(pavgw, "vpavgw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE3), 1C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Average of packed word integers INST3(pblendvb, "pblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x10), 1C, 2X, INS_TT_FULL_MEM, REX_W0) // Variable Blend Packed Bytes @@ -398,7 +398,7 @@ INST3(pmulhw, "vpmulhw", IUM_WR, BAD_CODE, BAD_CODE, INST3(pmulld, "vpmulld", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x40), 10C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 32 bit unsigned integers and store lower 32 bits of each result INST3(pmullw, "vpmullw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD5), 5C, 2X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 16 bit unsigned integers and store lower 16 bits of each result INST3(pmuludq, "vpmuludq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF4), 5C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // packed multiply 32-bit unsigned integers and store 64-bit result -INST3(pord, "vpor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise OR of two xmm regs +INST3(pord, "vpor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise OR of two xmm regs INST3(prefetchnta, "prefetchnta", IUM_RD, 0x000F0018, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2) INST3(prefetcht0, "prefetcht0", IUM_RD, 0x000F0818, BAD_CODE, BAD_CODE, ZERO, 
2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2) INST3(prefetcht1, "prefetcht1", IUM_RD, 0x000F1018, BAD_CODE, BAD_CODE, ZERO, 2X, INS_TT_TUPLE1_FIXED, Input_8Bit | REX_WIG | Encoding_REX2) @@ -421,14 +421,14 @@ INST3(psrld, "vpsrld", IUM_WR, BAD_CODE, PCKDBL(0x72), INST3(psrldq, "vpsrldq", IUM_WR, BAD_CODE, PCKDBL(0x73), BAD_CODE, 1C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Shift right logical of xmm reg by given number of bytes INST3(psrlq, "vpsrlq", IUM_WR, BAD_CODE, PCKDBL(0x73), PCKDBL(0xD3), ILLEGAL, ILLEGAL, INS_TT_FULL | INS_TT_MEM128, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 64-bit integers INST3(psrlw, "vpsrlw", IUM_WR, BAD_CODE, PCKDBL(0x71), PCKDBL(0xD1), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM | INS_TT_MEM128, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed shift right logical of 16-bit integers -INST3(psubb, "vpsubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers -INST3(psubd, "vpsubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers -INST3(psubq, "vpsubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers -INST3(psubsb, "vpsubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract 
packed 8-bit integers in b from packed 8-bit integers in a using saturation -INST3(psubsw, "vpsubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation -INST3(psubusb, "vpsubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation -INST3(psubusw, "vpsubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation -INST3(psubw, "vpsubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers +INST3(psubb, "vpsubb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF8), 1C, 3X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed byte (8-bit) integers +INST3(psubd, "vpsubd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFA), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed double-word (32-bit) integers +INST3(psubq, "vpsubq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xFB), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // subtract packed quad-word (64-bit) integers +INST3(psubsb, "vpsubsb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE8), 1C, 3X,
INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 8-bit integers in b from packed 8-bit integers in a using saturation +INST3(psubsw, "vpsubsw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xE9), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed 16-bit integers in b from packed 16-bit integers in a using saturation +INST3(psubusb, "vpsubusb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD8), 1C, 3X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit integers in a using saturation +INST3(psubusw, "vpsubusw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xD9), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit integers in a using saturation +INST3(psubw, "vpsubw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xF9), 1C, 3X, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed word (16-bit) integers INST3(ptest, "vptest", IUM_RD, BAD_CODE, BAD_CODE, SSE38(0x17), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Writes_CF) // Packed logical compare INST3(punpckhbw, "vpunpckhbw", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x68), 1C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) INST3(punpckhdq, "vpunpckhdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6A), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) @@ -438,7 +438,7 @@ INST3(punpcklbw, 
"vpunpcklbw", IUM_WR, BAD_CODE, BAD_CODE, INST3(punpckldq, "vpunpckldq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x62), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) INST3(punpcklqdq, "vpunpcklqdq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x6C), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen uint to ulong (lo) INST3(punpcklwd, "vpunpcklwd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x61), 1C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_WIG | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ushort to uint (lo) -INST3(pxord, "vpxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise XOR of two xmm regs +INST3(pxord, "vpxor", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Packed bit-wise XOR of two xmm regs INST3(rcpps, "vrcpps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x53), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX) // Reciprocal of packed singles INST3(rcpss, "vrcpss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x53), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_WIG | Encoding_VEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Reciprocal of scalar single INST3(roundpd, "vroundpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x09), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Round packed double precision floating-point values @@ -450,9 +450,9 @@ INST3(rsqrtss, "vrsqrtss", IUM_WR, BAD_CODE, BAD_CODE, INST3(sfence, "sfence", IUM_RD, 0x000FF8AE, 
BAD_CODE, BAD_CODE, ZERO, 6C, INS_TT_NONE, REX_WIG) INST3(shufpd, "vshufpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xC6), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) INST3(shufps, "vshufps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0xC6), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) -INST3(sqrtpd, "vsqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), 13C, 4C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed doubles -INST3(sqrtps, "vsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), 12C, 3C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed singles -INST3(sqrtsd, "vsqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), 13C, 4C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double +INST3(sqrtpd, "vsqrtpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x51), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed doubles +INST3(sqrtps, "vsqrtps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x51), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX) // Sqrt of packed singles +INST3(sqrtsd, "vsqrtsd", IUM_WR, BAD_CODE, BAD_CODE, SSEDBL(0x51), 16C, 4C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar double INST3(sqrtss, "vsqrtss", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0x51), 12C, 3C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Sqrt of scalar single INST3(subpd, "vsubpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x5C), 4C, 2X, 
INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed doubles INST3(subps, "vsubps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x5C), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Subtract packed singles @@ -464,8 +464,8 @@ INST3(unpckhpd, "vunpckhpd", IUM_WR, BAD_CODE, BAD_CODE, INST3(unpckhps, "vunpckhps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x15), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) INST3(unpcklpd, "vunpcklpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x14), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed logical (unsigned) widen ubyte to ushort (hi) INST3(unpcklps, "vunpcklps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x14), 1C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) -INST3(xorpd, "vxorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles -INST3(xorps, "vxorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed singles +INST3(xorpd, "vxorpd", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x57), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed doubles +INST3(xorps, "vxorps", IUM_WR, BAD_CODE, BAD_CODE, PCKFLT(0x57), 1C, 3X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0_EVEX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // XOR packed 
singles // Instructions for AESNI, PCLMULQDQ INST3(aesdec, "vaesdec", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0xDE), 4C, 1C, INS_TT_FULL_MEM, REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Perform one round of an AES decryption flow @@ -477,13 +477,13 @@ INST3(aeskeygenassist, "vaeskeygenassist", IUM_WR, BAD_CODE, BAD_CODE, INST3(pclmulqdq, "vpclmulqdq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x44), 7C, 1C, INS_TT_FULL_MEM, KMask_Base1 | REX_WIG | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Perform a carry-less multiplication of two quadwords // Instructions for SHA -INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA1 Message Dwords -INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords -INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds -INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Four Rounds of SHA1 Operation -INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords -INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords -INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_WIG) // Perform Two Rounds of SHA256 Operation +INST3(sha1msg1, "sha1msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC9), 2C, 1C, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate 
Calculation for the Next Four SHA1 Message Dwords +INST3(sha1msg2, "sha1msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCA), 6C, 2C, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA1 Message Dwords +INST3(sha1nexte, "sha1nexte", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xC8), 3C, 1C, INS_TT_FULL_MEM, REX_WIG) // Calculate SHA1 State Variable E After Four Rounds +INST3(sha1rnds4, "sha1rnds4", IUM_RW, BAD_CODE, BAD_CODE, SSE3A(0xCC), 4C, 1C, INS_TT_FULL_MEM, REX_WIG) // Perform Four Rounds of SHA1 Operation +INST3(sha256msg1, "sha256msg1", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCC), 5C, 2C, INS_TT_FULL_MEM, REX_WIG) // Perform an Intermediate Calculation for the Next Four SHA256 Message Dwords +INST3(sha256msg2, "sha256msg2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCD), 6C, 2C, INS_TT_FULL_MEM, REX_WIG) // Perform a Final Calculation for the Next Four SHA256 Message Dwords +INST3(sha256rnds2, "sha256rnds2", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xCB), 6C, 1C, INS_TT_FULL_MEM, REX_WIG) // Perform Two Rounds of SHA256 Operation // Instructions for GFNI INST3(gf2p8affineinvqb, "vgf2p8affineinvqb",IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0xCF), 5C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_WX | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Galois Field Affine Transformation Inverse @@ -500,8 +500,8 @@ INST3(vbroadcastsd, "vbroadcastsd", IUM_WR, BAD_CODE, BAD_CODE, INST3(vbroadcastss, "vbroadcastss", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x18), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast float value read from memory to entire ymm register INST3(vextractf32x4, "vextractf128", IUM_WR, SSE3A(0x19), BAD_CODE, BAD_CODE, 3C, 1C, INS_TT_TUPLE4, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_FLAGS_HasPseudoName) // Extract 128-bit packed floating point values INST3(vinsertf32x4, "vinsertf128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x18), 3C, 1C, 
INS_TT_TUPLE4, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Insert 128-bit packed floating point values -INST3(vmaskmovpd, "vmaskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores -INST3(vmaskmovps, "vmaskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores +INST3(vmaskmovpd, "vmaskmovpd", IUM_WR, SSE38(0x2F), BAD_CODE, SSE38(0x2D), ZERO, ZERO, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Double-Precision Floating-Point Loads and Stores +INST3(vmaskmovps, "vmaskmovps", IUM_WR, SSE38(0x2E), BAD_CODE, SSE38(0x2C), ZERO, ZERO, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Packed Single-Precision Floating-Point Loads and Stores INST3(vpblendvb, "vpblendvb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x4C), 2C, 1C, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Blend Packed Bytes INST3(vperm2f128, "vperm2f128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x06), 3C, 1C, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Permute Floating-Point Values INST3(vpermilpd, "vpermilpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x05), 1C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Permute In-Lane of Quadruples of Double-Precision Floating-Point Values @@ -522,7 +522,7 @@ INST3(vgatherdps, "vgatherdps", IUM_WR, BAD_CODE, BAD_CODE, INST3(vgatherqpd, "vgatherqpd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, 
Input_64Bit | REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed DP FP Values Using Signed Qword Indices INST3(vgatherqps, "vgatherqps", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x93), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed SP FP values Using Signed Qword Indices INST3(vinserti32x4, "vinserti128", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), 3C, 1C, INS_TT_TUPLE4, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) // Insert 128-bit packed integer values -INST3(vpblendd, "vpblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs +INST3(vpblendd, "vpblendd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x02), 1C, 3X, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Packed DWORDs INST3(vpbroadcastb, "vpbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x78), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast int8 value from reg/memory to entire ymm register INST3(vpbroadcastd, "vpbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x58), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX) // Broadcast int32 value from reg/memory to entire ymm register INST3(vpbroadcastq, "vpbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x59), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1_EVEX | Encoding_VEX | Encoding_EVEX) // Broadcast int64 value from reg/memory to entire ymm register @@ -536,8 +536,8 @@ INST3(vpgatherdd, "vpgatherdd", IUM_WR, BAD_CODE, BAD_CODE, INST3(vpgatherdq, "vpgatherdq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x90), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1 | 
Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword with Signed Dword Indices INST3(vpgatherqd, "vpgatherqd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base2 | REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Dword Values Using Signed Qword INST3(vpgatherqq, "vpgatherqq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x91), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Gather Packed Qword with Signed Dword Indices -INST3(vpmaskmovd, "vpmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores -INST3(vpmaskmovq, "vpmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores +INST3(vpmaskmovd, "vpmaskmovd", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), ZERO, ZERO, INS_TT_FULL_MEM, REX_W0 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Dword Loads and Stores +INST3(vpmaskmovq, "vpmaskmovq", IUM_WR, SSE38(0x8E), BAD_CODE, SSE38(0x8C), ZERO, ZERO, INS_TT_FULL_MEM, REX_W1 | Encoding_VEX | INS_Flags_IsDstDstSrcAVXInstruction) // Conditional SIMD Integer Packed Qword Loads and Stores INST3(vpsllvd, "vpsllvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical INST3(vpsllvq, "vpsllvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x47), 1C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Left Logical INST3(vpsravd, "vpsravd", 
IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x46), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_VEX | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Variable Bit Shift Right Arithmetic @@ -653,16 +653,16 @@ INST3(vpdpbuuds, "vpdpbuuds", IUM_WR, BAD_CODE, BAD_ // Instructions for AVX512-BMM #define FIRST_AVX512BMM_INSTRUCTION INS_vbmacor16x16x16 -INST3(vbmacor16x16x16, "vbmacor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with OR reduction. -INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with XOR reduction. -INST3(vbitrev, "vbitrev", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x81), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX ) // Bit reversal within a byte boundary. +INST3(vbmacor16x16x16, "vbmacor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), 1C, 1C, INS_TT_FULL_MEM, Input_16Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with OR reduction. +INST3(vbmacxor16x16x16, "vbmacxor16x16x16", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x80), 1C, 1C, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // 16x16 non-transposed fused BMM-accumulate (BMAC) with XOR reduction. +INST3(vbitrev, "vbitrev", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x81), 1C, 1C, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX ) // Bit reversal within a byte boundary. 
#define LAST_AVX512BMM_INSTRUCTION INS_vbitrev #define FIRST_AVXIFMA_INSTRUCTION INS_vpmadd52huq // Instructions for AVXIFMA -INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators -INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators +INST3(vpmadd52huq, "vpmadd52huq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB5), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Unsigned Integers and Add High 52-Bit Products to 64-Bit Accumulators +INST3(vpmadd52luq, "vpmadd52luq", IUM_RW, BAD_CODE, BAD_CODE, SSE38(0xB4), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | INS_Flags_IsDstDstSrcAVXInstruction) // Packed Multiply of Unsigned 52-Bit Integers and Add the Low 52-Bit Products to Qword Accumulators #define LAST_AVXIFMA_INSTRUCTION INS_vpmadd52luq #define LAST_AVX_INSTRUCTION INS_vpmadd52luq @@ -806,21 +806,21 @@ INST3(vinsertf64x4, "vinsertf64x4", IUM_WR, BAD_CODE, BAD_ INST3(vinserti32x8, "vinserti32x8", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), 3C, 1C, INS_TT_TUPLE8, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(vinserti64x2, "vinserti64x2", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x38), 3C, 1C, INS_TT_TUPLE2, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values INST3(vinserti64x4, "vinserti64x4", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3A), 3C, 1C, INS_TT_TUPLE4, 
Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Insert 256-bit packed quadword integer values -INST3(vmovdqa64, "vmovdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) -INST3(vmovdqu16, "vmovdqu16", IUM_WR, SSEDBL(0x7F), BAD_CODE, SSEDBL(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) -INST3(vmovdqu64, "vmovdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) -INST3(vmovdqu8, "vmovdqu8", IUM_WR, SSEDBL(0x7F), BAD_CODE, SSEDBL(0x6F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_EVEX) +INST3(vmovdqa64, "vmovdqa64", IUM_WR, PCKDBL(0x7F), BAD_CODE, PCKDBL(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) +INST3(vmovdqu16, "vmovdqu16", IUM_WR, SSEDBL(0x7F), BAD_CODE, SSEDBL(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) +INST3(vmovdqu64, "vmovdqu64", IUM_WR, SSEFLT(0x7F), BAD_CODE, SSEFLT(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX) +INST3(vmovdqu8, "vmovdqu8", IUM_WR, SSEDBL(0x7F), BAD_CODE, SSEDBL(0x6F), ZERO, 4X, INS_TT_FULL_MEM, REX_W0 | Encoding_EVEX) INST3(vpabsq, "vpabsq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x1F), 1C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Packed absolute value of 64-bit integers -INST3(vpandnq, "vpandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs -INST3(vpandq, "vpandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs +INST3(vpandnq, "vpandnq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDF), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | 
INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND NOT of two xmm regs +INST3(vpandq, "vpandq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xDB), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise AND of two xmm regs INST3(vpblendmb, "vpblendmb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x66), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Int64 vectors using an OpMask control INST3(vpblendmd, "vpblendmd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x64), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Byte vectors using an OpMask control INST3(vpblendmq, "vpblendmq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x64), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Int32 vectors using an OpMask control INST3(vpblendmw, "vpblendmw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x66), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Blend Word vectors using an OpMask control -INST3(vpbroadcastb_gpr, "vpbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX) // Broadcast int8 value from gpr to entire register -INST3(vpbroadcastd_gpr, "vpbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Broadcast int32 value from gpr to entire register -INST3(vpbroadcastq_gpr, "vpbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Broadcast int64 value from gpr to entire register -INST3(vpbroadcastw_gpr, "vpbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7B), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | 
KMask_Base8 | REX_W0 | Encoding_EVEX) // Broadcast int16 value from gpr to entire register +INST3(vpbroadcastb_gpr, "vpbroadcastb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7A), 5C, 1C, INS_TT_TUPLE1_SCALAR, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX) // Broadcast int8 value from gpr to entire register +INST3(vpbroadcastd_gpr, "vpbroadcastd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), 5C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Broadcast int32 value from gpr to entire register +INST3(vpbroadcastq_gpr, "vpbroadcastq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7C), 5C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Broadcast int64 value from gpr to entire register +INST3(vpbroadcastw_gpr, "vpbroadcastw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x7B), 5C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Broadcast int16 value from gpr to entire register INST3(vpcmpb, "vpcmpb", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x3F), 4C, 1C, INS_TT_FULL_MEM, KMask_Base16 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction | INS_FLAGS_HasPseudoName) INST3(vpcmpd, "vpcmpd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x1F), 4C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_Is3OperandInstructionMask | INS_FLAGS_HasPseudoName) INST3(vpcmpeqb, "vpcmpeqb", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0x74), 1C, 2X, INS_TT_FULL_MEM, KMask_Base16 | REX_WIG | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed compare 8-bit integers for equality @@ -893,7 +893,7 @@ INST3(vpmovuswb, "vpmovuswb", IUM_WR, PSSE38(0xF3, 0x10), BAD_ INST3(vpmovw2m, "vpmovw2m", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x29), 3C, 1C, INS_TT_NONE, REX_W1 | Encoding_EVEX) INST3(vpmovwb, "vpmovwb", IUM_WR, PSSE38(0xF3, 0x30), BAD_CODE, PSSE38(0xF3, 0x30), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) INST3(vpmullq, "vpmullq", IUM_WR, BAD_CODE, 
BAD_CODE, SSE38(0x40), 15C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed multiply 64 bit unsigned integers and store lower 64 bits of each result -INST3(vporq, "vporq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs +INST3(vporq, "vporq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEB), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise OR of two xmm regs INST3(vprold, "vprold", IUM_WR, BAD_CODE, PCKDBL(0x72), BAD_CODE, 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Bit rotate left INST3(vprolq, "vprolq", IUM_WR, BAD_CODE, PCKDBL(0x72), BAD_CODE, 1C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Bit rotate left INST3(vprolvd, "vprolvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x15), 1C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Bit rotate left @@ -921,7 +921,7 @@ INST3(vptestnmb, "vptestnmb", IUM_RD, BAD_CODE, BAD_ INST3(vptestnmd, "vptestnmd", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x27), 4C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask INST3(vptestnmq, "vptestnmq", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x27), 4C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask INST3(vptestnmw, "vptestnmw", IUM_RD, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x26), 4C, 1C, INS_TT_FULL_MEM, KMask_Base8 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Logical NAND and set mask -INST3(vpxorq, "vpxorq", IUM_WR, 
BAD_CODE, BAD_CODE, PCKDBL(0xEF), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs +INST3(vpxorq, "vpxorq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBL(0xEF), 1C, 3X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Packed bit-wise XOR of two xmm regs INST3(vrangepd, "vrangepd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x50), 4C, 2X, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Range restriction calculation from a pair of packed double-precision floating-point values INST3(vrangeps, "vrangeps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x50), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Range restriction calculation from a pair of packed single-precision floating-point values INST3(vrangesd, "vrangesd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x51), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Range restriction calculation from a pair of scalar double-precision floating-point value @@ -966,10 +966,10 @@ INST3(vpcompressb, "vpcompressb", IUM_WR, SSE38(0x63), BAD_ INST3(vpcompressw, "vpcompressw", IUM_WR, SSE38(0x63), BAD_CODE, BAD_CODE, 6C, 2C, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX) // Store sparse packed words into dense memory INST3(vpexpandb, "vpexpandb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x62), 6C, 2C, INS_TT_FULL_MEM, Input_8Bit | REX_W0 | Encoding_EVEX) // Load sparse packed bytes from dense memory INST3(vpexpandw, "vpexpandw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x62), 6C, 2C, INS_TT_FULL_MEM, Input_16Bit | REX_W1 | Encoding_EVEX) // Load sparse packed words from dense memory -INST3(vpopcntb, "vpopcntb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | 
REX_W0 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in BYTE -INST3(vpopcntd, "vpopcntd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in DWORD -INST3(vpopcntq, "vpopcntq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x55), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in QWORD -INST3(vpopcntw, "vpopcntw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x54), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W1 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in WORD +INST3(vpopcntb, "vpopcntb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x54), 3C, 1C, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in BYTE +INST3(vpopcntd, "vpopcntd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x55), 3C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in DWORD +INST3(vpopcntq, "vpopcntq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x55), 3C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in QWORD +INST3(vpopcntw, "vpopcntw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x54), 3C, 1C, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W1 | Encoding_EVEX) // Return the Count of Number of Bits Set to 1 in WORD INST3(vpshldd, "vpshldd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x71), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Concatenate and Shift Packed Data Left Logical INST3(vpshldq, "vpshldq", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x71), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Concatenate and Shift Packed Data Left Logical INST3(vpshldvd, "vpshldvd", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x71), ILLEGAL, ILLEGAL, 
INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Concatenate and Variable Shift Packed Data Left Logical @@ -982,124 +982,124 @@ INST3(vpshrdvd, "vpshrdvd", IUM_WR, BAD_CODE, BAD_ INST3(vpshrdvq, "vpshrdvq", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x73), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Concatenate and Variable Shift Packed Data Right Logical INST3(vpshrdvw, "vpshrdvw", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x72), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W1 | Encoding_EVEX) // Concatenate and Variable Shift Packed Data Right Logical INST3(vpshrdw, "vpshrdw", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x72), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W1 | Encoding_EVEX) // Concatenate and Shift Packed Data Right Logical -INST3(vpshufbitqmb, "vpshufbitqmb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX) // Shuffle Bits From Quadword Elements Using Byte Indexes Into Mask +INST3(vpshufbitqmb, "vpshufbitqmb", IUM_WR, BAD_CODE, BAD_CODE, SSE38(0x8F), 6C, 1C, INS_TT_FULL_MEM, Input_8Bit | KMask_Base16 | REX_W0 | Encoding_EVEX) // Shuffle Bits From Quadword Elements Using Byte Indexes Into Mask // Instructions for AVX512-BF16, AVX512-FP16 -INST3(vaddph, "vaddph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x58), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Add Packed FP16 Values -INST3(vaddsh, "vaddsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x58), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Add Scalar FP16 Values -INST3(vcmpph, "vcmpph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0xC2), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Compare Packed FP16 Values -INST3(vcmpsh, "vcmpsh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0xF3, 0xC2), ILLEGAL, ILLEGAL, 
INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compare Scalar FP16 Values -INST3(vcomish, "vcomish", IUM_RD, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x2F), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Compare Scalar Ordered FP16 Values and Set EFLAGS +INST3(vaddph, "vaddph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x58), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Add Packed FP16 Values +INST3(vaddsh, "vaddsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x58), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Add Scalar FP16 Values +INST3(vcmpph, "vcmpph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0xC2), 3C, 1C, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Compare Packed FP16 Values +INST3(vcmpsh, "vcmpsh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0xF3, 0xC2), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compare Scalar FP16 Values +INST3(vcomish, "vcomish", IUM_RD, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x2F), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Compare Scalar Ordered FP16 Values and Set EFLAGS INST3(vcvtdq2ph, "vcvtdq2ph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5B), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed Signed DWORD Integers to Packed FP16 Values INST3(vcvtne2ps2bf16, "vcvtne2ps2bf16", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x72), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Two Packed Single Data to One Packed BF16 Data INST3(vcvtneps2bf16, "vcvtneps2bf16", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x72), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Two Packed Single Data to One Packed BF16 Data -INST3(vcvtpd2ph, "vcvtpd2ph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 
0x5A), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Convert Packed Double Precision FP Values to Packed FP16 Values +INST3(vcvtpd2ph, "vcvtpd2ph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x5A), 8C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Convert Packed Double Precision FP Values to Packed FP16 Values INST3(vcvtph2dq, "vcvtph2dq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x5B), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Signed DWORD Integers -INST3(vcvtph2pd, "vcvtph2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5A), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Double Precision FP Values +INST3(vcvtph2pd, "vcvtph2pd", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5A), 15C, 2C, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Double Precision FP Values INST3(vcvtph2psx, "vcvtph2psx", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x13), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Single Precision FP Values INST3(vcvtph2qq, "vcvtph2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7B), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Signed QWORD Integers -INST3(vcvtph2udq, "vcvtph2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x79), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Unsigned DWORD Integers +INST3(vcvtph2udq, "vcvtph2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x79), 8C, 1C, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Unsigned DWORD 
Integers INST3(vcvtph2uqq, "vcvtph2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x79), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Unsigned QWORD Integers -INST3(vcvtph2uw, "vcvtph2uw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x7D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Unsigned WORD Integers -INST3(vcvtph2w, "vcvtph2w", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Signed WORD Integers +INST3(vcvtph2uw, "vcvtph2uw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x7D), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Unsigned WORD Integers +INST3(vcvtph2w, "vcvtph2w", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7D), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed FP16 Values to Packed Signed WORD Integers INST3(vcvtps2phx, "vcvtps2phx", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x1D), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values INST3(vcvtqq2ph, "vcvtqq2ph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5B), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Convert Packed Signed QWORD Integers to Packed FP16 Values -INST3(vcvtsd2sh, "vcvtsd2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x5A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Double Precision FP Value to Scalar FP16 Value -INST3(vcvtsh2sd, "vcvtsh2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | 
REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Double Precision FP Value -INST3(vcvtsh2si32, "vcvtsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Signed DWORD Integer -INST3(vcvtsh2si64, "vcvtsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Signed QWORD Integer -INST3(vcvtsh2ss, "vcvtsh2ss", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x13), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Single Precision FP Value -INST3(vcvtsh2usi32, "vcvtsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x79), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Unsigned DWORD Integer -INST3(vcvtsh2usi64, "vcvtsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x79), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Unsigned QWORD Integer -INST3(vcvtsi2sh32, "vcvtsi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar Signed DWORD Integer to Scalar FP16 Value -INST3(vcvtsi2sh64, "vcvtsi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Signed QWORD Integer to Scalar FP16 Value -INST3(vcvtss2sh, "vcvtss2sh", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x1D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar Single Precision FP Value to Scalar FP16 Value 
-INST3(vcvttph2dq, "vcvttph2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5B), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Signed DWORD Integers +INST3(vcvtsd2sh, "vcvtsd2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x5A), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Double Precision FP Value to Scalar FP16 Value +INST3(vcvtsh2sd, "vcvtsh2sd", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5A), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Double Precision FP Value +INST3(vcvtsh2si32, "vcvtsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2D), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Signed DWORD Integer +INST3(vcvtsh2si64, "vcvtsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2D), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Signed QWORD Integer +INST3(vcvtsh2ss, "vcvtsh2ss", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x06, 0x13), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Single Precision FP Value +INST3(vcvtsh2usi32, "vcvtsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x79), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Unsigned DWORD Integer +INST3(vcvtsh2usi64, "vcvtsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x79), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar FP16 Value to Scalar Unsigned QWORD Integer +INST3(vcvtsi2sh32, "vcvtsi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2A), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | 
Encoding_EVEX) // Convert Scalar Signed DWORD Integer to Scalar FP16 Value +INST3(vcvtsi2sh64, "vcvtsi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2A), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Signed QWORD Integer to Scalar FP16 Value +INST3(vcvtss2sh, "vcvtss2sh", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x1D), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar Single Precision FP Value to Scalar FP16 Value +INST3(vcvttph2dq, "vcvttph2dq", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5B), 8C, 1C, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Signed DWORD Integers INST3(vcvttph2qq, "vcvttph2qq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7A), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Signed QWORD Integers -INST3(vcvttph2udq, "vcvttph2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x78), ILLEGAL, ILLEGAL, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Unsigned DWORD Integers +INST3(vcvttph2udq, "vcvttph2udq", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x78), 8C, 1C, INS_TT_HALF_MEM, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Unsigned DWORD Integers INST3(vcvttph2uqq, "vcvttph2uqq", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x78), ILLEGAL, ILLEGAL, INS_TT_QUARTER_MEM, Input_16Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Unsigned QWORD Integers -INST3(vcvttph2uw, "vcvttph2uw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x7C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed 
Unsigned WORD Integers -INST3(vcvttph2w, "vcvttph2w", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Signed WORD Integers -INST3(vcvttsh2si32, "vcvttsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Signed DWORD Integer -INST3(vcvttsh2si64, "vcvttsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Signed QWORD Integer -INST3(vcvttsh2usi32, "vcvttsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x78), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Unsigned DWORD Integer -INST3(vcvttsh2usi64, "vcvttsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x78), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Unsigned QWORD Integer +INST3(vcvttph2uw, "vcvttph2uw", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x7C), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Unsigned WORD Integers +INST3(vcvttph2w, "vcvttph2w", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x7C), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert with Truncation Packed FP16 Values to Packed Signed WORD Integers +INST3(vcvttsh2si32, "vcvttsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2C), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Signed 
DWORD Integer +INST3(vcvttsh2si64, "vcvttsh2si", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x2C), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Signed QWORD Integer +INST3(vcvttsh2usi32, "vcvttsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x78), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Unsigned DWORD Integer +INST3(vcvttsh2usi64, "vcvttsh2usi", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x78), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert with Truncation Scalar FP16 Value to Scalar Unsigned QWORD Integer INST3(vcvtudq2ph, "vcvtudq2ph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x7A), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values INST3(vcvtuqq2ph, "vcvtuqq2ph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x7A), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values -INST3(vcvtusi2sh32, "vcvtusi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x7B), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar Unsigned DWORD Integer to Scalar FP16 Value -INST3(vcvtusi2sh64, "vcvtusi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x7B), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Unsigned QWORD Integer to Scalar FP16 Value -INST3(vcvtuw2ph, "vcvtuw2ph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x7D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values -INST3(vcvtw2ph, "vcvtw2ph", IUM_WR, BAD_CODE, BAD_CODE, 
SSEFLTMAP(0x05, 0x7D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values +INST3(vcvtusi2sh32, "vcvtusi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x7B), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Scalar Unsigned DWORD Integer to Scalar FP16 Value +INST3(vcvtusi2sh64, "vcvtusi2sh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x7B), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX) // Convert Scalar Unsigned QWORD Integer to Scalar FP16 Value +INST3(vcvtuw2ph, "vcvtuw2ph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x7D), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values +INST3(vcvtw2ph, "vcvtw2ph", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x7D), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Packed Single Precision FP Values to Packed FP16 Values INST3(vdivph, "vdivph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5E), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Divide Packed FP16 Values INST3(vdivsh, "vdivsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Divide Scalar FP16 Values -INST3(vdpbf16ps, "vdpbf16ps", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x52), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Dot Product of BF16 Pairs Accumulated Into Packed Single Precision -INST3(vfcmaddcph, "vfcmaddcph", IUM_RW, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0x56), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Packed FP16 Values -INST3(vfcmaddcsh, "vfcmaddcsh", IUM_RW, BAD_CODE, BAD_CODE, 
SSEDBLMAP(0x06, 0x57), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Scalar FP16 Values -INST3(vfcmulcph, "vfcmulcph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0xD6), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply Packed FP16 Values -INST3(vfcmulcsh, "vfcmulcsh", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0xD7), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply Scalar FP16 Values -INST3(vfmadd132ph, "vvfmadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x98), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values -INST3(vfmadd132sh, "vvfmadd132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x99), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Scalar FP16 Values -INST3(vfmadd213ph, "vvfmadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values -INST3(vfmadd213sh, "vvfmadd213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA9), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Scalar FP16 Values -INST3(vfmadd231ph, "vvfmadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB8), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values -INST3(vfmadd231sh, "vvfmadd231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB9), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Scalar FP16 Values -INST3(vfmaddcph, "vfmaddcph", IUM_RW, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0x56), ILLEGAL, 
ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Packed FP16 Values -INST3(vfmaddcsh, "vfmaddcsh", IUM_RW, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0x57), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Scalar FP16 Values -INST3(vfmaddsub132ph, "vvfmaddsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x96), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of Packed FP16 Values -INST3(vfmaddsub213ph, "vvfmaddsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA6), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of Packed FP16 Values -INST3(vfmaddsub231ph, "vvfmaddsub231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB6), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of Packed FP16 Values -INST3(vfmsub132ph, "vvfmsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9A), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values -INST3(vfmsub132sh, "vvfmsub132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9B), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values -INST3(vfmsub213ph, "vvfmsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAA), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values -INST3(vfmsub213sh, "vvfmsub213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAB), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values 
-INST3(vfmsub231ph, "vvfmsub231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBA), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values -INST3(vfmsub231sh, "vvfmsub231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBB), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values -INST3(vfmsubadd132ph, "vvfmsubadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x97), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of Packed FP16 Values -INST3(vfmsubadd213ph, "vvfmsubadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA7), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of Packed FP16 Values -INST3(vfmsubadd231ph, "vvfmsubadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB7), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of Packed FP16 Values -INST3(vfmulcph, "vfmulcph", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0xD6), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply Packed FP16 Values -INST3(vfmulcsh, "vfmulcsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0xD7), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply Scalar FP16 Values -INST3(vfnmadd132ph, "vvfnmadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values -INST3(vfnmadd132sh, "vvfnmadd132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | 
Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values -INST3(vfnmadd213ph, "vvfnmadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values -INST3(vfnmadd213sh, "vvfnmadd213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAD), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values -INST3(vfnmadd231ph, "vvfnmadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBC), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values -INST3(vfnmadd231sh, "vvfnmadd231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBD), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values -INST3(vfnmsub132ph, "vvfnmsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9E), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values -INST3(vfnmsub132sh, "vvfnmsub132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9F), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values -INST3(vfnmsub213ph, "vvfnmsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAE), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values -INST3(vfnmsub213sh, "vvfnmsub213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAF), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values -INST3(vfnmsub231ph, "vvfnmsub231ph", IUM_RW, BAD_CODE, 
BAD_CODE, PCKDBLMAP(0x06, 0xBE), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values -INST3(vfnmsub231sh, "vvfnmsub231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBF), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values -INST3(vfpclassph, "vfpclassph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x66), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Test Types of Packed FP16 Values -INST3(vfpclasssh, "vfpclasssh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x67), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Test Types of Scalar FP16 Values -INST3(vgetexpph, "vgetexpph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x42), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Exponents of Packed FP16 Values to FP16 Values -INST3(vgetexpsh, "vgetexpsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x43), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Exponents of Scalar FP16 Values to FP16 Values -INST3(vgetmantph, "vgetmantph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x26), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Extract Normalized Mantissas from Packed FP16 Values -INST3(vgetmantsh, "vgetmantsh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x27), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Extract Normalized Mantissas from Scalar FP16 Values -INST3(vmaxph, "vmaxph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5F), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Return Maximum of Packed FP16 Values -INST3(vmaxsh, "vmaxsh", IUM_WR, BAD_CODE, BAD_CODE, 
SSEFLTMAP(0x05, 0x5F), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Return Maximum of Scalar FP16 Values -INST3(vminph, "vminph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x00, 0x5D), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Return Minimum of Packed FP16 Values -INST3(vminsh, "vminsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x00, 0x5D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Return Minimum of Scalar FP16 Values +INST3(vdpbf16ps, "vdpbf16ps", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF3, 0x52), 8C, 2C, INS_TT_FULL, Input_16Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Dot Product of BF16 Pairs Accumulated Into Packed Single Precision +INST3(vfcmaddcph, "vfcmaddcph", IUM_RW, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0x56), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Packed FP16 Values +INST3(vfcmaddcsh, "vfcmaddcsh", IUM_RW, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0x57), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Scalar FP16 Values +INST3(vfcmulcph, "vfcmulcph", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0xD6), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply Packed FP16 Values +INST3(vfcmulcsh, "vfcmulcsh", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x06, 0xD7), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply Scalar FP16 Values +INST3(vfmadd132ph, "vvfmadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x98), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values +INST3(vfmadd132sh, "vvfmadd132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x99), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // 
Fused Multiply-Add of Scalar FP16 Values +INST3(vfmadd213ph, "vvfmadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA8), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values +INST3(vfmadd213sh, "vvfmadd213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA9), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Scalar FP16 Values +INST3(vfmadd231ph, "vvfmadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB8), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Packed FP16 Values +INST3(vfmadd231sh, "vvfmadd231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB9), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Add of Scalar FP16 Values +INST3(vfmaddcph, "vfmaddcph", IUM_RW, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0x56), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Packed FP16 Values +INST3(vfmaddcsh, "vfmaddcsh", IUM_RW, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0x57), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply and Accumulate Scalar FP16 Values +INST3(vfmaddsub132ph, "vvfmaddsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x96), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of Packed FP16 Values +INST3(vfmaddsub213ph, "vvfmaddsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA6), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of Packed FP16 Values +INST3(vfmaddsub231ph, "vvfmaddsub231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB6), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Add/Subtract of 
Packed FP16 Values +INST3(vfmsub132ph, "vvfmsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9A), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values +INST3(vfmsub132sh, "vvfmsub132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9B), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values +INST3(vfmsub213ph, "vvfmsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAA), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values +INST3(vfmsub213sh, "vvfmsub213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAB), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values +INST3(vfmsub231ph, "vvfmsub231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBA), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Packed FP16 Values +INST3(vfmsub231sh, "vvfmsub231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBB), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Multiply-Subtract of Scalar FP16 Values +INST3(vfmsubadd132ph, "vvfmsubadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x97), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of Packed FP16 Values +INST3(vfmsubadd213ph, "vvfmsubadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xA7), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of Packed FP16 Values +INST3(vfmsubadd231ph, "vvfmsubadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xB7), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Multiply-Alternating Subtract/Add of 
Packed FP16 Values +INST3(vfmulcph, "vfmulcph", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0xD6), 8C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Complex Multiply Packed FP16 Values +INST3(vfmulcsh, "vfmulcsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x06, 0xD7), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Complex Multiply Scalar FP16 Values +INST3(vfnmadd132ph, "vvfnmadd132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9C), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values +INST3(vfnmadd132sh, "vvfnmadd132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9D), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values +INST3(vfnmadd213ph, "vvfnmadd213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAC), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values +INST3(vfnmadd213sh, "vvfnmadd213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAD), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values +INST3(vfnmadd231ph, "vvfnmadd231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBC), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Packed FP16 Values +INST3(vfnmadd231sh, "vvfnmadd231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBD), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Add of Scalar FP16 Values +INST3(vfnmsub132ph, "vvfnmsub132ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9E), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values +INST3(vfnmsub132sh, 
"vvfnmsub132sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x9F), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values +INST3(vfnmsub213ph, "vvfnmsub213ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAE), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values +INST3(vfnmsub213sh, "vvfnmsub213sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xAF), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values +INST3(vfnmsub231ph, "vvfnmsub231ph", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBE), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Packed FP16 Values +INST3(vfnmsub231sh, "vvfnmsub231sh", IUM_RW, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0xBF), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Fused Negative Multiply-Subtract of Scalar FP16 Values +INST3(vfpclassph, "vfpclassph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x66), 3C, 1C, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Test Types of Packed FP16 Values +INST3(vfpclasssh, "vfpclasssh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x67), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Test Types of Scalar FP16 Values +INST3(vgetexpph, "vgetexpph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x42), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Convert Exponents of Packed FP16 Values to FP16 Values +INST3(vgetexpsh, "vgetexpsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x43), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Convert Exponents of Scalar FP16 Values to FP16 Values +INST3(vgetmantph, "vgetmantph", IUM_WR, BAD_CODE, 
BAD_CODE, PSSE3A(0x00, 0x26), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Extract Normalized Mantissas from Packed FP16 Values +INST3(vgetmantsh, "vgetmantsh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x27), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Extract Normalized Mantissas from Scalar FP16 Values +INST3(vmaxph, "vmaxph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5F), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Return Maximum of Packed FP16 Values +INST3(vmaxsh, "vmaxsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5F), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Return Maximum of Scalar FP16 Values +INST3(vminsh, "vminsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x00, 0x5D), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Return Minimum of Scalar FP16 Values +INST3(vminph, "vminph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x00, 0x5D), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Return Minimum of Packed FP16 Values INST3(vmovsh, "vmovsh", IUM_WR, SSEFLTMAP(0x00, 0x11), BAD_CODE, SSEFLTMAP(0x00, 0x10), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Move Scalar FP16 Value INST3(vmovw, "vmovw", IUM_WR, PCKDBLMAP(0x06, 0x7E), BAD_CODE, PCKDBLMAP(0x00, 0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_WIG | Encoding_EVEX) // Move Word -INST3(vmulph, "vmulph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x59), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Multiply Packed FP16 Values -INST3(vmulsh, "vmulsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x59), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Multiply Scalar FP16 Values +INST3(vmulph, "vmulph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 
0x59), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Multiply Packed FP16 Values +INST3(vmulsh, "vmulsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x59), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Multiply Scalar FP16 Values INST3(vrcpph, "vrcpph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Packed FP16 Values -INST3(vrcpsh, "vrcpsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Scalar FP16 Values -INST3(vreduceph, "vreduceph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x56), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Perform Reduction Transformation on Packed FP16 Values -INST3(vreducesh, "vreducesh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x57), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Perform Reduction Transformation on Scalar FP16 Values -INST3(vrndscaleph, "vrndscaleph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x08), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Round Packed FP16 Values to Include a Given Number of Fraction Bits -INST3(vrndscalesh, "vrndscalesh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x0A), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Round Scalar FP16 Values to Include a Given Number of Fraction Bits +INST3(vrcpsh, "vrcpsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4D), 4C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Scalar FP16 Values +INST3(vreduceph, "vreduceph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x56), 12C, 2C, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | 
REX_W0 | Encoding_EVEX) // Perform Reduction Transformation on Packed FP16 Values +INST3(vreducesh, "vreducesh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x57), 12C, 2C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Perform Reduction Transformation on Scalar FP16 Values +INST3(vrndscaleph, "vrndscaleph", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x08), 8C, 1C, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Round Packed FP16 Values to Include a Given Number of Fraction Bits +INST3(vrndscalesh, "vrndscalesh", IUM_WR, BAD_CODE, BAD_CODE, PSSE3A(0x00, 0x0A), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Round Scalar FP16 Values to Include a Given Number of Fraction Bits INST3(vrsqrtph, "vrsqrtph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4E), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Square Roots of Packed FP16 Values -INST3(vrsqrtsh, "vrsqrtsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4F), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Square Roots of Scalar FP16 Values -INST3(vscalefph, "vscalefph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x2C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Scale Packed FP16 Values with FP16 Values -INST3(vscalefsh, "vscalefsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x2D), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Scale Scalar FP16 Values with FP16 Values +INST3(vrsqrtsh, "vrsqrtsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x4F), 5C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute REciprocals of Square Roots of Scalar FP16 Values +INST3(vscalefph, "vscalefph", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x2C), 8C, 1C, INS_TT_FULL_MEM, 
Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Scale Packed FP16 Values with FP16 Values +INST3(vscalefsh, "vscalefsh", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x06, 0x2D), 8C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Scale Scalar FP16 Values with FP16 Values INST3(vsqrtph, "vsqrtph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x51), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Compute Square Root of Packed FP16 Values -INST3(vsqrtsh, "vsqrtsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x51), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute Square Root of Scalar FP16 Values -INST3(vsubph, "vsubph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5C), ILLEGAL, ILLEGAL, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Subtract Packed FP16 Values -INST3(vsubsh, "vsubsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5C), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Subtract Scalar FP16 Values -INST3(vucomish, "vucomish", IUM_RD, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x2E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Compare Scalar Unordered FP16 Values and Set EFLAGS +INST3(vsqrtsh, "vsqrtsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x51), 15C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Compute Square Root of Scalar FP16 Values +INST3(vsubph, "vsubph", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x5C), 4C, 2X, INS_TT_FULL_MEM, Input_16Bit | KMask_Base8 | REX_W0 | Encoding_EVEX) // Subtract Packed FP16 Values +INST3(vsubsh, "vsubsh", IUM_WR, BAD_CODE, BAD_CODE, SSEFLTMAP(0x05, 0x5C), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_16Bit | KMask_Base1 | REX_W0 | Encoding_EVEX) // Subtract Scalar FP16 Values +INST3(vucomish, "vucomish", IUM_RD, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x2E), 3C, 
1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Compare Scalar Unordered FP16 Values and Set EFLAGS // AVX512-VP2INTERSECT INST3(vp2intersectd, "vp2intersectd", IUM_WR, BAD_CODE, BAD_CODE, PSSE38(0xF2, 0x68), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // Compute Intersection Between DWORDS to a Pair of Mask Registers @@ -1110,16 +1110,16 @@ INST3(vcomxsd, "vcomxsd", IUM_RD, BAD_CODE, BAD_ INST3(vcomxss, "vcomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Compare single precision floating point values and set flags INST3(vcvtps2ibs, "vcvtps2ibs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x69), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) INST3(vcvtps2iubs, "vcvtps2iubs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6B), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar single to unsigned DWORD/QWORD -INST3(vcvttpd2dqs, "vcvttpd2dqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6D), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles to DWORDs +INST3(vcvttpd2dqs, "vcvttpd2dqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6D), 7C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles to DWORDs INST3(vcvttpd2qqs, "vcvttpd2qqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6D), 7C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles to signed QWORDs -INST3(vcvttpd2udqs, "vcvttpd2udqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6C), ILLEGAL, ILLEGAL, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles 
to unsigned DWORDs +INST3(vcvttpd2udqs, "vcvttpd2udqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6C), 7C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles to unsigned DWORDs INST3(vcvttpd2uqqs, "vcvttpd2uqqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6C), 7C, 1C, INS_TT_FULL, Input_64Bit | KMask_Base2 | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation packed doubles to signed QWORDs INST3(vcvttps2dqs, "vcvttps2dqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6D), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation packed singles to DWORDs INST3(vcvttps2ibs, "vcvttps2ibs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x68), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar single to unsigned DWORD/QWORD INST3(vcvttps2iubs, "vcvttps2iubs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6A), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar single to unsigned DWORD/QWORD -INST3(vcvttps2qqs, "vcvttps2qqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6D), ILLEGAL, ILLEGAL, INS_TT_HALF, Input_32Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation packed singles to signed QWORDs +INST3(vcvttps2qqs, "vcvttps2qqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6D), 7C, 1C, INS_TT_HALF, Input_32Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation packed singles to signed QWORDs INST3(vcvttps2udqs, "vcvttps2udqs", IUM_WR, BAD_CODE, BAD_CODE, PCKFLTMAP(0x05, 0x6C), 7C, 1C, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation packed singles to unsigned DWORDs -INST3(vcvttps2uqqs, "vcvttps2uqqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6C), ILLEGAL, ILLEGAL, INS_TT_HALF, Input_32Bit | KMask_Base2 | REX_W0 | 
Encoding_EVEX) // cvt with truncation/saturation packed singles to unsigned QWORDs +INST3(vcvttps2uqqs, "vcvttps2uqqs", IUM_WR, BAD_CODE, BAD_CODE, PCKDBLMAP(0x05, 0x6C), 7C, 1C, INS_TT_HALF, Input_32Bit | KMask_Base2 | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation packed singles to unsigned QWORDs INST3(vcvttsd2sis32, "vcvttsd2sis", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x6D), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar double to signed DWORDs INST3(vcvttsd2sis64, "vcvttsd2sis", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x6D), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX) // cvt with truncation/saturation scalar double to signed DWORDs INST3(vcvttsd2usis32, "vcvttsd2usis", IUM_WR, BAD_CODE, BAD_CODE, SSEDBLMAP(0x05, 0x6C), 7C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W0 | Encoding_EVEX) // cvt with truncation/saturation scalar double to unsigned DWORD @@ -1132,8 +1132,8 @@ INST3(vminmaxpd, "vminmaxpd", IUM_WR, BAD_CODE, BAD_ INST3(vminmaxps, "vminmaxps", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x52), 4C, 2X, INS_TT_FULL, Input_32Bit | KMask_Base4 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Maximum packed singles INST3(vminmaxsd, "vminmaxsd", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x53), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_64Bit | KMask_Base1 | REX_W1 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Minimum/Maximum scalar double INST3(vminmaxss, "vminmaxss", IUM_WR, BAD_CODE, BAD_CODE, SSE3A(0x53), 4C, 2X, INS_TT_TUPLE1_SCALAR, Input_32Bit | KMask_Base1 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstSrcSrcAVXInstruction) // Return Minimum/Maximum scalar single -INST3(vmovd_simd, "vmovd", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX) // Move DWORD between xmm regs <-> memory/xmm regs -INST3(vmovw_simd, "vmovw", IUM_WR, SSEFLTMAP(0x05, 0x7E), BAD_CODE, 
SSEFLTMAP(0x05, 0x6E), ILLEGAL, ILLEGAL, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Move WORD between xmm regs <-> memory/xmm regs +INST3(vmovd_simd, "vmovd", IUM_WR, PCKDBL(0xD6), BAD_CODE, SSEFLT(0x7E), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX) // Move DWORD between xmm regs <-> memory/xmm regs +INST3(vmovw_simd, "vmovw", IUM_WR, SSEFLTMAP(0x05, 0x7E), BAD_CODE, SSEFLTMAP(0x05, 0x6E), 1C, 1C, INS_TT_TUPLE1_SCALAR, Input_16Bit | REX_W0 | Encoding_EVEX) // Move WORD between xmm regs <-> memory/xmm regs INST3(vmpsadbw, "vmpsadbw", IUM_WR, BAD_CODE, BAD_CODE, AVX3A(0x42), 4C, 2C, INS_TT_FULL_MEM, KMask_Base8 | REX_W0 | Encoding_EVEX | INS_Flags_IsDstDstSrcAVXInstruction) // Compute Multiple Packed Sums of Absolute Difference INST3(vucomxsd, "vucomxsd", IUM_RD, BAD_CODE, BAD_CODE, SSEFLT(0x2f), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_64Bit | REX_W1 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Perform an unordered compare of double precision floating point values and set flags INST3(vucomxss, "vucomxss", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0x2E), 3C, 1C, INS_TT_TUPLE1_SCALAR, Input_32Bit | REX_W0 | Encoding_EVEX | Writes_OF | Writes_SF | Writes_ZF | Writes_PF | Writes_CF | Resets_AF) // Perform an unordered compare of single precision floating point values and set flags @@ -1143,42 +1143,42 @@ INST3(vucomxss, "vucomxss", IUM_RD, BAD_CODE, BAD_ // id nm um mr mi rm lat tp tt flags #define FIRST_APX_INSTRUCTION INS_ccmpo #define FIRST_CCMP_INSTRUCTION INS_ccmpo -INST3(ccmpo, "ccmpo", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpno, "ccmpno", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpb, "ccmpb", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | 
Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpae, "ccmpae", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpe, "ccmpe", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpne, "ccmpne", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpbe, "ccmpbe", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpa, "ccmpa", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmps, "ccmps", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpns, "ccmpns", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpt, "ccmpt", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpf, "ccmpf", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpl, "ccmpl", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmpge, "ccmpge", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) -INST3(ccmple, "ccmple", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) 
-INST3(ccmpg, "ccmpg", IUM_RD, 0x000038, 0x0003880, 0x00003A, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpo, "ccmpo", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpno, "ccmpno", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpb, "ccmpb", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpae, "ccmpae", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpe, "ccmpe", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpne, "ccmpne", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpbe, "ccmpbe", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpa, "ccmpa", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmps, "ccmps", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpns, "ccmpns", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpt, "ccmpt", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpf, "ccmpf", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpl, 
"ccmpl", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpge, "ccmpge", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmple, "ccmple", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) +INST3(ccmpg, "ccmpg", IUM_RD, 0x000038, 0x0003880, 0x00003A, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_CF | INS_FLAGS_Has_Sbit) #define LAST_CCMP_INSTRUCTION INS_ccmpg INST3(crc32_apx, "crc32", IUM_RW, BAD_CODE, BAD_CODE, 0x0000F0, 3C, 1C, INS_TT_NONE, INS_FLAGS_None) -INST3(movbe_apx, "movbe", IUM_WR, 0x000061, BAD_CODE, 0x000060, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) - -INST3(seto_apx, "setzuo", IUM_WR, SSEDBLMAP(4, 0x40), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST3(setno_apx, "setzuno", IUM_WR, SSEDBLMAP(4, 0x41), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF) -INST3(setb_apx, "setzub", IUM_WR, SSEDBLMAP(4, 0x42), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST3(setae_apx, "setzuae", IUM_WR, SSEDBLMAP(4, 0x43), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF) -INST3(sete_apx, "setzue", IUM_WR, SSEDBLMAP(4, 0x44), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST3(setne_apx, "setzune", IUM_WR, SSEDBLMAP(4, 0x45), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF) -INST3(setbe_apx, "setzube", IUM_WR, SSEDBLMAP(4, 0x46), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST3(seta_apx, "setzua", IUM_WR, SSEDBLMAP(4, 0x47), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF) -INST3(sets_apx, "setzus", IUM_WR, SSEDBLMAP(4, 0x48), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST3(setns_apx, "setzuns", IUM_WR, SSEDBLMAP(4, 0x49), BAD_CODE, BAD_CODE, 
ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF) -INST3(setp_apx, "setzup", IUM_WR, SSEDBLMAP(4, 0x4A), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST3(setnp_apx, "setzunp", IUM_WR, SSEDBLMAP(4, 0x4B), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF) -INST3(setl_apx, "setzul", IUM_WR, SSEDBLMAP(4, 0x4C), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST3(setge_apx, "setzuge", IUM_WR, SSEDBLMAP(4, 0x4D), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF) -INST3(setle_apx, "setzule", IUM_WR, SSEDBLMAP(4, 0x4E), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) -INST3(setg_apx, "setzug", IUM_WR, SSEDBLMAP(4, 0x4F), BAD_CODE, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST3(movbe_apx, "movbe", IUM_WR, 0x000061, BAD_CODE, 0x000060, 1C, 2X, INS_TT_NONE, INS_FLAGS_None) + +INST3(seto_apx, "setzuo", IUM_WR, SSEDBLMAP(4, 0x40), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF) +INST3(setno_apx, "setzuno", IUM_WR, SSEDBLMAP(4, 0x41), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF) +INST3(setb_apx, "setzub", IUM_WR, SSEDBLMAP(4, 0x42), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_CF) +INST3(setae_apx, "setzuae", IUM_WR, SSEDBLMAP(4, 0x43), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_CF) +INST3(sete_apx, "setzue", IUM_WR, SSEDBLMAP(4, 0x44), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_ZF) +INST3(setne_apx, "setzune", IUM_WR, SSEDBLMAP(4, 0x45), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_ZF) +INST3(setbe_apx, "setzube", IUM_WR, SSEDBLMAP(4, 0x46), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(seta_apx, "setzua", IUM_WR, SSEDBLMAP(4, 0x47), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF) +INST3(sets_apx, "setzus", IUM_WR, SSEDBLMAP(4, 0x48), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_SF) +INST3(setns_apx, "setzuns", IUM_WR, SSEDBLMAP(4, 0x49), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, 
Reads_SF) +INST3(setp_apx, "setzup", IUM_WR, SSEDBLMAP(4, 0x4A), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_PF) +INST3(setnp_apx, "setzunp", IUM_WR, SSEDBLMAP(4, 0x4B), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_PF) +INST3(setl_apx, "setzul", IUM_WR, SSEDBLMAP(4, 0x4C), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(setge_apx, "setzuge", IUM_WR, SSEDBLMAP(4, 0x4D), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF) +INST3(setle_apx, "setzule", IUM_WR, SSEDBLMAP(4, 0x4E), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) +INST3(setg_apx, "setzug", IUM_WR, SSEDBLMAP(4, 0x4F), BAD_CODE, BAD_CODE, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF) #define LAST_APX_INSTRUCTION INS_setg_apx // Scalar instructions in SSE4.2 @@ -1197,7 +1197,7 @@ INST3(lzcnt_apx, "lzcnt", IUM_WR, BAD_CODE, BAD_CODE, #endif // MOVBE -INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) +INST3(movbe, "movbe", IUM_WR, PCKMVB(0xF1), BAD_CODE, PCKMVB(0xF0), 1C, 2X, INS_TT_NONE, INS_FLAGS_None) // POPCNT INST3(popcnt, "popcnt", IUM_WR, BAD_CODE, BAD_CODE, SSEFLT(0xB8), 3C, 1C, INS_TT_NONE, Resets_OF | Resets_SF | Writes_ZF | Resets_AF | Resets_PF | Resets_CF | Encoding_REX2) @@ -1210,31 +1210,36 @@ INST3(tpause, "tpause", IUM_RD, BAD_CODE, BAD_CODE, INST3(umonitor, "umonitor", IUM_RD, BAD_CODE, BAD_CODE, SSEFLT(0xAE), ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) // User Level Set Up Monitor Address INST3(umwait, "umwait", IUM_RD, BAD_CODE, BAD_CODE, SSEDBL(0xAE), ILLEGAL, ILLEGAL, INS_TT_NONE, Resets_OF | Resets_SF | Resets_ZF | Resets_AF | Resets_PF | Writes_CF) // User Level Monitor Wait -INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 
0x0010F6, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) +INST3(neg, "neg", IUM_RW, 0x0018F6, BAD_CODE, 0x0018F6, 1C, 4X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Writes_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(not, "not", IUM_RW, 0x0010F6, BAD_CODE, 0x0010F6, 1C, 4X, INS_TT_NONE, INS_FLAGS_None | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol, "rol", IUM_RW, 0x0000D2, BAD_CODE, 0x0000D2, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST3(rol_1, "rol", IUM_RW, 0x0000D0, 0x0000D0, 0x0000D0, 1C, 1C, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(rol_N, "rol", IUM_RW, 0x0000C0, 0x0000C0, BAD_CODE, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(ror, "ror", IUM_RW, 0x0008D2, BAD_CODE, 0x0008D2, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST3(ror_1, "ror", IUM_RW, 0x0008D0, 0x0008D0, 0x0008D0, 1C, 1C, INS_TT_NONE, Writes_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(ror_N, "ror", IUM_RW, 
0x0008C0, 0x0008C0, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(ror_N, "ror", IUM_RW, 0x0008C0, 0x0008C0, BAD_CODE, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) INST3(rcl, "rcl", IUM_RW, 0x0010D2, BAD_CODE, 0x0010D2, 6C, 6C, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) INST3(rcl_1, "rcl", IUM_RW, 0x0010D0, 0x0010D0, 0x0010D0, 2C, 1C, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) INST3(rcl_N, "rcl", IUM_RW, 0x0010C0, 0x0010C0, BAD_CODE, 6C, 6C, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) + INST3(rcr, "rcr", IUM_RW, 0x0018D2, BAD_CODE, 0x0018D2, 6C, 6C, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) INST3(rcr_1, "rcr", IUM_RW, 0x0018D0, 0x0018D0, 0x0018D0, 2C, 1C, INS_TT_NONE, Writes_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) INST3(rcr_N, "rcr", IUM_RW, 0x0018C0, 0x0018C0, BAD_CODE, 6C, 6C, INS_TT_NONE, Undefined_OF | Writes_CF | Reads_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD) -INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | 
Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, ILLEGAL, ILLEGAL, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) -INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(shl, "shl", IUM_RW, 0x0020D2, BAD_CODE, 0x0020D2, 2C, 1C, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shl_1, "shl", IUM_RW, 0x0020D0, 0x0020D0, 0x0020D0, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | 
INS_Flags_Has_NF) +INST3(shl_N, "shl", IUM_RW, 0x0020C0, 0x0020C0, BAD_CODE, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(shr, "shr", IUM_RW, 0x0028D2, BAD_CODE, 0x0028D2, 2C, 1C, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_1, "shr", IUM_RW, 0x0028D0, 0x0028D0, 0x0028D0, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(shr_N, "shr", IUM_RW, 0x0028C0, 0x0028C0, BAD_CODE, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) + +INST3(sar, "sar", IUM_RW, 0x0038D2, BAD_CODE, 0x0038D2, 2C, 1C, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_1, "sar", IUM_RW, 0x0038D0, 0x0038D0, 0x0038D0, 1C, 2X, INS_TT_NONE, Writes_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) +INST3(sar_N, "sar", IUM_RW, 0x0038C0, 0x0038C0, BAD_CODE, 1C, 2X, INS_TT_NONE, Undefined_OF | Writes_SF | Writes_ZF | Undefined_AF | Writes_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NDD | INS_Flags_Has_NF) // id nm um mr mi lat tp tt flags INST2(ret, "ret", IUM_RD, 0x0000C3, 0x0000C2, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_None) @@ -1272,7 +1277,7 @@ INST1(leave, "leave", IUM_RD, 0x0000C9, INST1(serialize, "serialize", IUM_RD, 0x0fe801, 105C, 50C, INS_TT_NONE, INS_FLAGS_None) -INST1(cwde, "cwde", IUM_RD, 0x000098, ILLEGAL, ILLEGAL, INS_TT_NONE, 
INS_FLAGS_HasPseudoName) +INST1(cwde, "cwde", IUM_RD, 0x000098, 1C, 4X, INS_TT_NONE, INS_FLAGS_HasPseudoName) INST1(cdq, "cdq", IUM_RD, 0x000099, 1C, 2X, INS_TT_NONE, INS_FLAGS_HasPseudoName) INST1(idiv, "idiv", IUM_RD, 0x0038F6, ILLEGAL, ILLEGAL, INS_TT_NONE, Undefined_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Undefined_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) INST1(imulEAX, "imul", IUM_RD, 0x0028F6, 4C, 1C, INS_TT_NONE, Writes_OF | Undefined_SF | Undefined_ZF | Undefined_AF | Undefined_PF | Writes_CF | INS_FLAGS_Has_Wbit | Encoding_REX2 | INS_Flags_Has_NF) @@ -1295,22 +1300,22 @@ INST1(fld, "fld", IUM_WR, 0x0000D9, INST1(fstp, "fstp", IUM_WR, 0x0018D9, ILLEGAL, ILLEGAL, INS_TT_NONE, INS_FLAGS_x87Instr) #endif // TARGET_X86 -INST1(seto, "seto", IUM_WR, 0x0F0090, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST1(setno, "setno", IUM_WR, 0x0F0091, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Encoding_REX2) -INST1(setb, "setb", IUM_WR, 0x0F0092, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST1(setae, "setae", IUM_WR, 0x0F0093, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_CF | Encoding_REX2) -INST1(sete, "sete", IUM_WR, 0x0F0094, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST1(setne, "setne", IUM_WR, 0x0F0095, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Encoding_REX2) -INST1(setbe, "setbe", IUM_WR, 0x0F0096, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST1(seta, "seta", IUM_WR, 0x0F0097, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) -INST1(sets, "sets", IUM_WR, 0x0F0098, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST1(setns, "setns", IUM_WR, 0x0F0099, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_SF | Encoding_REX2) -INST1(setp, "setp", IUM_WR, 0x0F009A, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST1(setnp, "setnp", IUM_WR, 0x0F009B, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_PF | Encoding_REX2) -INST1(setl, "setl", IUM_WR, 
0x0F009C, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST1(setge, "setge", IUM_WR, 0x0F009D, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) -INST1(setle, "setle", IUM_WR, 0x0F009E, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) -INST1(setg, "setg", IUM_WR, 0x0F009F, ILLEGAL, ILLEGAL, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST1(seto, "seto", IUM_WR, 0x0F0090, 1C, 2X, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST1(setno, "setno", IUM_WR, 0x0F0091, 1C, 2X, INS_TT_NONE, Reads_OF | Encoding_REX2) +INST1(setb, "setb", IUM_WR, 0x0F0092, 1C, 2X, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST1(setae, "setae", IUM_WR, 0x0F0093, 1C, 2X, INS_TT_NONE, Reads_CF | Encoding_REX2) +INST1(sete, "sete", IUM_WR, 0x0F0094, 1C, 2X, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST1(setne, "setne", IUM_WR, 0x0F0095, 1C, 2X, INS_TT_NONE, Reads_ZF | Encoding_REX2) +INST1(setbe, "setbe", IUM_WR, 0x0F0096, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST1(seta, "seta", IUM_WR, 0x0F0097, 1C, 2X, INS_TT_NONE, Reads_ZF | Reads_CF | Encoding_REX2) +INST1(sets, "sets", IUM_WR, 0x0F0098, 1C, 2X, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST1(setns, "setns", IUM_WR, 0x0F0099, 1C, 2X, INS_TT_NONE, Reads_SF | Encoding_REX2) +INST1(setp, "setp", IUM_WR, 0x0F009A, 1C, 2X, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST1(setnp, "setnp", IUM_WR, 0x0F009B, 1C, 2X, INS_TT_NONE, Reads_PF | Encoding_REX2) +INST1(setl, "setl", IUM_WR, 0x0F009C, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setge, "setge", IUM_WR, 0x0F009D, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Encoding_REX2) +INST1(setle, "setle", IUM_WR, 0x0F009E, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) +INST1(setg, "setg", IUM_WR, 0x0F009F, 1C, 2X, INS_TT_NONE, Reads_OF | Reads_SF | Reads_ZF | Encoding_REX2) // Indirect jump used for tailcalls. 
We differentiate between func-internal // indirect jump (e.g. used for switch) and tailcall indirect jumps because the