From fffd34540b50cee2ae5a3e1215e525f029eec124 Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Tue, 16 Jan 2018 18:35:38 -0800 Subject: [PATCH 1/2] table drive Intel hardware intrinsic --- src/jit/compiler.h | 15 +- src/jit/emit.h | 14 + src/jit/emitfmtsxarch.h | 2 + src/jit/emitxarch.cpp | 124 ++++- src/jit/emitxarch.h | 15 +- src/jit/gentree.cpp | 8 +- src/jit/hwintrinsiccodegenxarch.cpp | 183 ++++--- src/jit/hwintrinsiclistxarch.h | 291 +++++----- src/jit/hwintrinsicxarch.cpp | 503 ++++++++---------- src/jit/instrsxarch.h | 9 + src/jit/lowerxarch.cpp | 108 +--- src/jit/lsraxarch.cpp | 26 +- src/jit/namedintrinsiclist.h | 75 ++- .../HardwareIntrinsics/X86/Avx/Multiply.cs | 108 ++++ .../X86/Avx/Multiply_r.csproj | 34 ++ .../X86/Avx/Multiply_ro.csproj | 34 ++ .../HardwareIntrinsics/X86/Avx2/Multiply.cs | 116 ++++ .../X86/Avx2/Multiply_r.csproj | 34 ++ .../X86/Avx2/Multiply_ro.csproj | 34 ++ .../HardwareIntrinsics/X86/Sse41/Multiply.cs | 95 ++++ .../X86/Sse41/Multiply_r.csproj | 34 ++ .../X86/Sse41/Multiply_ro.csproj | 34 ++ 22 files changed, 1246 insertions(+), 650 deletions(-) create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj create mode 100644 tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj diff --git a/src/jit/compiler.h b/src/jit/compiler.h index 7f2f3b85303d..ea1c6b2ac443 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3042,11 +3042,11 @@ class Compiler NamedIntrinsic 
lookupNamedIntrinsic(CORINFO_METHOD_HANDLE method); #if FEATURE_HW_INTRINSICS - InstructionSet lookupHWIntrinsicISA(const char* className); - NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa); - InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic); - bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic); - bool isFullyImplmentedISAClass(InstructionSet isa); + static InstructionSet lookupHWIntrinsicISA(const char* className); + static NamedIntrinsic lookupHWIntrinsic(const char* methodName, InstructionSet isa); + static InstructionSet isaOfHWIntrinsic(NamedIntrinsic intrinsic); + static bool isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic); + static bool isFullyImplmentedISAClass(InstructionSet isa); #ifdef _TARGET_XARCH_ GenTree* impUnsupportedHWIntrinsic(unsigned helper, CORINFO_METHOD_HANDLE method, @@ -3119,7 +3119,12 @@ class Compiler bool compSupportsHWIntrinsic(InstructionSet isa); bool isScalarISA(InstructionSet isa); static int ivalOfHWIntrinsic(NamedIntrinsic intrinsic); + static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic); static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type); + static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic); + static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic); + GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass); + GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig); #endif // _TARGET_XARCH_ #endif // FEATURE_HW_INTRINSICS GenTreePtr impArrayAccessIntrinsic(CORINFO_CLASS_HANDLE clsHnd, diff --git a/src/jit/emit.h b/src/jit/emit.h index a602cfcf91f4..fd4ccbd8bf73 100644 --- a/src/jit/emit.h +++ b/src/jit/emit.h @@ -938,6 +938,7 @@ class emitter struct { regNumber _idReg3 : REGNUM_BITS; + regNumber _idReg4 : REGNUM_BITS; }; #endif // defined(_TARGET_XARCH_) @@ -1119,6 +1120,19 @@ class emitter idAddr()->_idReg3 = reg; assert(reg == idAddr()->_idReg3); } + regNumber idReg4() 
const + { + assert(!idIsTiny()); + assert(!idIsSmallDsc()); + return idAddr()->_idReg4; + } + void idReg4(regNumber reg) + { + assert(!idIsTiny()); + assert(!idIsSmallDsc()); + idAddr()->_idReg4 = reg; + assert(reg == idAddr()->_idReg4); + } #endif // defined(_TARGET_XARCH_) #ifdef _TARGET_ARMARCH_ insOpts idInsOpt() const diff --git a/src/jit/emitfmtsxarch.h b/src/jit/emitfmtsxarch.h index 953103a49596..b7ab38f0c034 100644 --- a/src/jit/emitfmtsxarch.h +++ b/src/jit/emitfmtsxarch.h @@ -110,6 +110,8 @@ IF_DEF(RRW_RRW_CNS, IS_R1_RW|IS_R2_RW, SCNS) // r/w reg , r/w r IF_DEF(RWR_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD, NONE) // write reg , read reg2 , read reg3 IF_DEF(RWR_RRD_RRD_CNS, IS_R1_WR|IS_R2_RD|IS_R3_RD, SCNS) // write reg , read reg2 , read reg3, const + +IF_DEF(RWR_RRD_RRD_RRD, IS_R1_WR|IS_R2_RD|IS_R3_RD|IS_R4_RD, NONE) // write reg , read reg2 , read reg3 , read reg4 //---------------------------------------------------------------------------- // The following formats are used for direct addresses (e.g. 
static data members) //---------------------------------------------------------------------------- diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index f55226fde6e7..0f6a9ff059b8 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -146,6 +146,7 @@ bool emitter::IsDstDstSrcAVXInstruction(instruction ins) case INS_pminub: case INS_pminud: case INS_pminuw: + case INS_pmuldq: case INS_pmulld: case INS_pmullw: case INS_pmuludq: @@ -4227,6 +4228,43 @@ void emitter::emitIns_R_R_S_I( emitCurIGsize += sz; } +static bool isAvxBlendv(instruction ins) +{ + return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; +} + +static bool isSse41Blendv(instruction ins) +{ + return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb; +} + +void emitter::emitIns_R_R_R_R( + instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3) +{ + assert(isAvxBlendv(ins)); + assert(UseVEXEncoding()); + // Currently vex prefix only use three bytes mode. + // size = vex + opcode + ModR/M + 1-byte-cns(Reg) = 3 + 1 + 1 + 1 = 6 + // TODO-XArch-CQ: We should create function which can calculate all kinds of AVX instructions size in future + UNATIVE_OFFSET sz = 6; + + // AVX/AVX2 supports 4-reg format for vblendvps/vblendvpd/vpblendvb, + // which encodes the fourth register into imm8[7:4] + int ival = (reg3 - XMMBASE) << 4; // convert reg3 to ival + + instrDesc* id = emitNewInstrCns(attr, ival); + id->idIns(ins); + id->idInsFmt(IF_RWR_RRD_RRD_RRD); + id->idReg1(targetReg); + id->idReg2(reg1); + id->idReg3(reg2); + id->idReg4(reg3); + + id->idCodeSize(sz); + dispIns(id); + emitCurIGsize += sz; +} + /***************************************************************************** * * Add an instruction with a register + static member operands. 
@@ -5166,23 +5204,17 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu } #if FEATURE_HW_INTRINSICS -void emitter::emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype) -{ - emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg1); -} - -void emitter::emitIns_SIMD_R_R_A( - instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype) +void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir) { if (UseVEXEncoding()) { - emitIns_R_R_A(ins, emitTypeSize(simdtype), reg, reg1, indir, IF_RWR_RRD_ARD); + emitIns_R_R_A(ins, attr, reg, reg1, indir, IF_RWR_RRD_ARD); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD); + emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD); } @@ -5205,51 +5237,90 @@ void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1 } void emitter::emitIns_SIMD_R_R_C( - instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs) { if (UseVEXEncoding()) { - emitIns_R_R_C(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs); + emitIns_R_R_C(ins, attr, reg, reg1, fldHnd, offs); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_C(ins, emitTypeSize(simdtype), reg, fldHnd, offs); + emitIns_R_C(ins, attr, reg, fldHnd, offs); } } -void emitter::emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype) +void emitter::emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2) { if (UseVEXEncoding()) { - emitIns_R_R_R(ins, emitTypeSize(simdtype), reg, reg1, reg2); + 
emitIns_R_R_R(ins, attr, reg, reg1, reg2); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_R(ins, emitTypeSize(simdtype), reg, reg2); + emitIns_R_R(ins, attr, reg, reg2); } } -void emitter::emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype) +void emitter::emitIns_SIMD_R_R_R_R( + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3) { + assert(isAvxBlendv(ins) || isSse41Blendv(ins)); if (UseVEXEncoding()) { - emitIns_R_R_S(ins, emitTypeSize(simdtype), reg, reg1, varx, offs); + // convert SSE encoding of SSE4.1 instructions to VEX encoding + switch (ins) + { + case INS_blendvps: + ins = INS_vblendvps; + break; + case INS_blendvpd: + ins = INS_vblendvpd; + break; + case INS_pblendvb: + ins = INS_vpblendvb; + break; + default: + break; + } + emitIns_R_R_R_R(ins, attr, reg, reg1, reg2, reg3); } else { + assert(isSse41Blendv(ins)); + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + if (reg3 != REG_XMM0) + { + emitIns_R_R(INS_movaps, attr, REG_XMM0, reg3); + } if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); + } + emitIns_R_R(ins, attr, reg, reg2); + } +} + +void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs) +{ + if (UseVEXEncoding()) + { + emitIns_R_R_S(ins, attr, reg, reg1, varx, offs); + } + else + { + if (reg1 != reg) + { + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_S(ins, emitTypeSize(simdtype), reg, varx, offs); + emitIns_R_S(ins, attr, reg, varx, offs); } } @@ -7653,6 +7724,14 @@ void emitter::emitDispIns( val = emitGetInsSC(id); goto PRINT_CONSTANT; break; + case IF_RWR_RRD_RRD_RRD: + assert(IsAVXOnlyInstruction(ins)); + assert(UseVEXEncoding()); + printf("%s, ", emitRegName(id->idReg1(), attr)); + printf("%s, 
", emitRegName(id->idReg2(), attr)); + printf("%s, ", emitRegName(id->idReg3(), attr)); + printf("%s", emitRegName(id->idReg4(), attr)); + break; case IF_RRW_RRW_CNS: printf("%s,", emitRegName(id->idReg1(), attr)); printf(" %s", emitRegName(id->idReg2(), attr)); @@ -10304,7 +10383,7 @@ BYTE* emitter::emitOutputRRR(BYTE* dst, instrDesc* id) instruction ins = id->idIns(); assert(IsAVXInstruction(ins)); - assert(IsThreeOperandAVXInstruction(ins)); + assert(IsThreeOperandAVXInstruction(ins) || isAvxBlendv(ins)); regNumber targetReg = id->idReg1(); regNumber src1 = id->idReg2(); regNumber src2 = id->idReg3(); @@ -11570,6 +11649,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) sz = emitSizeOfInsDsc(id); break; case IF_RWR_RRD_RRD_CNS: + case IF_RWR_RRD_RRD_RRD: dst = emitOutputRRR(dst, id); sz = emitSizeOfInsDsc(id); dst += emitOutputByte(dst, emitGetInsSC(id)); diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 9c176bc545e8..047344787116 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -398,6 +398,8 @@ void emitIns_R_R_R_I(instruction ins, emitAttr attr, regNumber reg1, regNumber r void emitIns_R_R_S_I(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, int varx, int offs, int ival); +void emitIns_R_R_R_R(instruction ins, emitAttr attr, regNumber reg1, regNumber reg2, regNumber reg3, regNumber reg4); + void emitIns_S(instruction ins, emitAttr attr, int varx, int offs); void emitIns_S_R(instruction ins, emitAttr attr, regNumber ireg, int varx, int offs); @@ -453,13 +455,7 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); #if FEATURE_HW_INTRINSICS -void emitIns_SIMD_R_R(instruction ins, regNumber reg, regNumber reg1, var_types simdtype); -void emitIns_SIMD_R_R_A(instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, var_types simdtype); void 
emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype); -void emitIns_SIMD_R_R_C( - instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, var_types simdtype); -void emitIns_SIMD_R_R_R(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, var_types simdtype); -void emitIns_SIMD_R_R_S(instruction ins, regNumber reg, regNumber reg1, int varx, int offs, var_types simdtype); void emitIns_SIMD_R_R_A_I( instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype); void emitIns_SIMD_R_R_C_I(instruction ins, @@ -472,6 +468,13 @@ void emitIns_SIMD_R_R_C_I(instruction ins, void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype); void emitIns_SIMD_R_R_S_I( instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype); +void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir); +void emitIns_SIMD_R_R_C( + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs); +void emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs); +void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2); +void emitIns_SIMD_R_R_R_R( + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, regNumber reg3); #endif #if FEATURE_STACK_FP_X87 diff --git a/src/jit/gentree.cpp b/src/jit/gentree.cpp index 9fd4acfb2e24..5d1fdc2f604f 100644 --- a/src/jit/gentree.cpp +++ b/src/jit/gentree.cpp @@ -17914,20 +17914,20 @@ GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, } GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode( - var_types type, GenTree* op1, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size) + var_types type, GenTree* op1, 
NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize) { SetOpLclRelatedToSIMDIntrinsic(op1); - return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, size); + return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, hwIntrinsicID, baseType, simdSize); } GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode( - var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned size) + var_types type, GenTree* op1, GenTree* op2, NamedIntrinsic hwIntrinsicID, var_types baseType, unsigned simdSize) { SetOpLclRelatedToSIMDIntrinsic(op1); SetOpLclRelatedToSIMDIntrinsic(op2); - return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, size); + return new (this, GT_HWIntrinsic) GenTreeHWIntrinsic(type, op1, op2, hwIntrinsicID, baseType, simdSize); } GenTreeHWIntrinsic* Compiler::gtNewSimdHWIntrinsicNode(var_types type, diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index ed61c9530910..6ea0de7e2332 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -24,10 +24,94 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "gcinfo.h" #include "gcinfoencoder.h" +//------------------------------------------------------------------------ +// genIsTableDrivenHWIntrinsic: +// +// Arguments: +// category - category of a HW intrinsic +// +// Return Value: +// returns true if this category can be table-driven in CodeGen +// +static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +{ + // TODO - make more categories to the table-driven framework + const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; + const bool nonTableDrivenIntrinsic = category == HW_Category_Special; + return tableDrivenIntrinsic && !nonTableDrivenIntrinsic; +} + void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) { - NamedIntrinsic 
intrinsicID = node->gtHWIntrinsicId; - InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID); + NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; + InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); + HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); + HWIntrinsicFlag flag = Compiler::flagOfHWIntrinsic(intrinsicID); + int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); + + assert((flag & HW_Flag_NoCodeGen) == 0); + + if (genIsTableDrivenHWIntrinsic(category)) + { + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); + regNumber targetReg = node->gtRegNum; + var_types targetType = node->TypeGet(); + var_types baseType = node->gtSIMDBaseType; + + regNumber op1Reg = REG_NA; + regNumber op2Reg = REG_NA; + emitter* emit = getEmitter(); + + assert(numArgs >= 0); + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType); + assert(ins != INS_invalid); + emitAttr simdSize = (emitAttr)(node->gtSIMDSize); + assert(simdSize != 0); + + switch (numArgs) + { + case 1: + genConsumeOperands(node); + op1Reg = op1->gtRegNum; + emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); + break; + case 2: + genHWIntrinsic_R_R_RM(node, ins); + break; + case 3: + { + assert(op1->OperIsList()); + assert(op1->gtGetOp2()->OperIsList()); + assert(op1->gtGetOp2()->gtGetOp2()->OperIsList()); + + GenTreeArgList* argList = op1->AsArgList(); + op1 = argList->Current(); + genConsumeRegs(op1); + op1Reg = op1->gtRegNum; + + argList = argList->Rest(); + op2 = argList->Current(); + genConsumeRegs(op2); + op2Reg = op2->gtRegNum; + + argList = argList->Rest(); + GenTree* op3 = argList->Current(); + genConsumeRegs(op3); + regNumber op3Reg = op3->gtRegNum; + + emit->emitIns_SIMD_R_R_R_R(ins, simdSize, targetReg, op1Reg, op2Reg, op3Reg); + break; + } + + default: + unreached(); + break; + } + genProduceReg(node); + return; + } + switch (isa) { case InstructionSet_SSE: @@ -87,6 +171,7 @@ void 
CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins) regNumber targetReg = node->gtRegNum; GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); + emitAttr simdSize = (emitAttr)(node->gtSIMDSize); emitter* emit = getEmitter(); // TODO-XArch-CQ: Commutative operations can have op1 be contained @@ -136,13 +221,13 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins) case GT_CLS_VAR_ADDR: { - emit->emitIns_SIMD_R_R_C(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, targetType); + emit->emitIns_SIMD_R_R_C(ins, simdSize, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0); return; } default: { - emit->emitIns_SIMD_R_R_A(ins, targetReg, op1Reg, memIndir, targetType); + emit->emitIns_SIMD_R_R_A(ins, simdSize, targetReg, op1Reg, memIndir); return; } } @@ -180,11 +265,11 @@ void CodeGen::genHWIntrinsic_R_R_RM(GenTreeHWIntrinsic* node, instruction ins) assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); assert(offset != (unsigned)-1); - emit->emitIns_SIMD_R_R_S(ins, targetReg, op1Reg, varNum, offset, targetType); + emit->emitIns_SIMD_R_R_S(ins, simdSize, targetReg, op1Reg, varNum, offset); } else { - emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op2->gtRegNum, targetType); + emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op2->gtRegNum); } } @@ -691,40 +776,6 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node) switch (intrinsicID) { - case NI_SSE2_Add: - { - assert(node->TypeGet() == TYP_SIMD16); - - switch (baseType) - { - case TYP_DOUBLE: - ins = INS_addpd; - break; - case TYP_INT: - case TYP_UINT: - ins = INS_paddd; - break; - case TYP_LONG: - case TYP_ULONG: - ins = INS_paddq; - break; - case TYP_BYTE: - case TYP_UBYTE: - ins = INS_paddb; - break; - case TYP_SHORT: - case TYP_USHORT: - ins = INS_paddw; - break; - default: - unreached(); - break; - } - - genHWIntrinsic_R_R_RM(node, ins); - break; - } - default: unreached(); break; @@ -800,27 +851,6 @@ void 
CodeGen::genAVXIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicID) { - case NI_AVX_Add: - { - assert(node->TypeGet() == TYP_SIMD32); - - switch (baseType) - { - case TYP_DOUBLE: - ins = INS_addpd; - break; - case TYP_FLOAT: - ins = INS_addps; - break; - default: - unreached(); - break; - } - - genHWIntrinsic_R_R_RM(node, ins); - break; - } - default: unreached(); break; @@ -839,37 +869,6 @@ void CodeGen::genAVX2Intrinsic(GenTreeHWIntrinsic* node) switch (intrinsicID) { - case NI_AVX2_Add: - { - assert(node->TypeGet() == TYP_SIMD32); - - switch (baseType) - { - case TYP_INT: - case TYP_UINT: - ins = INS_paddd; - break; - case TYP_LONG: - case TYP_ULONG: - ins = INS_paddq; - break; - case TYP_BYTE: - case TYP_UBYTE: - ins = INS_paddb; - break; - case TYP_SHORT: - case TYP_USHORT: - ins = INS_paddw; - break; - default: - unreached(); - break; - } - - genHWIntrinsic_R_R_RM(node, ins); - break; - } - default: unreached(); break; diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index cd5e59d10eb6..f9ccf7fd8c27 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -11,145 +11,166 @@ // clang-format off #if FEATURE_HW_INTRINSICS -// Intrinsic ID Function name ISA -// SSE Intrinsics -HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE) -HARDWARE_INTRINSIC(SSE_Add, "Add", SSE) -HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE) -HARDWARE_INTRINSIC(SSE_And, "And", SSE) -HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE) -HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE) 
-HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE) 
-HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE) -HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE) -HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE) -HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE) -HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE) -HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE) -HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE) -HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE) -HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE) -HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE) -HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE) -HARDWARE_INTRINSIC(SSE_Max, "Max", SSE) -HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE) -HARDWARE_INTRINSIC(SSE_Min, "Min", SSE) -HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE) -HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE) -HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE) -HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE) -HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE) -HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE) -HARDWARE_INTRINSIC(SSE_MultiplyScalar, 
"MultiplyScalar", SSE) -HARDWARE_INTRINSIC(SSE_Or, "Or", SSE) -HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE) -HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE) -HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE) -HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE) -HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE) -HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE) -HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE) -HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE) -HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE) -HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE) -HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE) -HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE) -HARDWARE_INTRINSIC(SSE_Store, "Store", SSE) -HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE) -HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE) -HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE) -HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE) -HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE) -HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE) -HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE) -HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE) -HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE) -HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE) - -// SSE2 Intrinsics -HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2) -HARDWARE_INTRINSIC(SSE2_Add, "Add", SSE2) - -// SSE3 Intrinsics -HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSupported", SSE3) - -// SSSE3 Intrinsics -HARDWARE_INTRINSIC(SSSE3_IsSupported, "get_IsSupported", SSSE3) - -// SSE41 Intrinsics -HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41) - -// SSE42 Intrinsics -HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42) -HARDWARE_INTRINSIC(SSE42_Crc32, "Crc32", SSE42) - -// AVX Intrinsics -HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX) 
-HARDWARE_INTRINSIC(AVX_Add, "Add", AVX) - -// AVX2 Intrinsics -HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSupported", AVX2) -HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2) - -// AES Intrinsics -HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES) - -// BMI1 Intrinsics -HARDWARE_INTRINSIC(BMI1_IsSupported, "get_IsSupported", BMI1) - -// BMI2 Intrinsics -HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSupported", BMI2) - -// FMA Intrinsics -HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA) - -// LZCNT Intrinsics -HARDWARE_INTRINSIC(LZCNT_IsSupported, "get_IsSupported", LZCNT) -HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT) +/* Note + 1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic` + 2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID + 3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0 + 4) SIMD intrinsics have a non-zero `SIMD size` field based on whether they operate over `Vector128<T>` (16) or `Vector256<T>` (32) + 5) Scalar intrinsics that operate over general purpose registers (e.g., Sse42.Crc32) have `SIMD size` with 0 + 6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple parameter numbers have this field with -1 + 7) Each intrinsic has 10 `instructions` fields that list the instructions that should be generated based on the base type + 8) Each intrinsic has one category with type of `enum HWIntrinsicCategory`, please see the definition of HWIntrinsicCategory for details + 9) Each intrinsic has one or more flags with type of `enum HWIntrinsicFlag` +*/ +// 
************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************ +// Intrinsic ID Function name ISA ival SIMD size NumArg instructions Category Flags +// {TYP_BYTE, TYP_UBYTE, TYP_SHORT, TYP_USHORT, TYP_INT, TYP_UINT, TYP_LONG, TYP_ULONG, TYP_FLOAT, TYP_DOUBLE} +// ************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************ +// SSE Intrinsics +HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Add, "Add", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_And, "And", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) 
+HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Fixed) +HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, 
INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, "CompareGreaterThanOrEqual", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) 
+HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, 
HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Or, "Or", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +//HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, 
HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) + +// SSE2 Intrinsics +HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE2_Add, "Add", SSE2, -1, 16, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_addpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) + +// SSE3 Intrinsics +HARDWARE_INTRINSIC(SSE3_IsSupported, "get_IsSupported", SSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// SSSE3 Intrinsics 
+HARDWARE_INTRINSIC(SSSE3_IsSupported, "get_IsSupported", SSSE3, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// SSE41 Intrinsics +HARDWARE_INTRINSIC(SSE41_IsSupported, "get_IsSupported", SSE41, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE41_Multiply, "Multiply", SSE41, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE41_BlendVariable, "BlendVariable", SSE41, -1, 16, 3, {INS_pblendvb, INS_pblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_blendvps, INS_blendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) + +// SSE42 Intrinsics +HARDWARE_INTRINSIC(SSE42_IsSupported, "get_IsSupported", SSE42, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE42_Crc32, "Crc32", SSE42, -1, 0, 2, {INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_crc32, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag) + +// AVX Intrinsics +HARDWARE_INTRINSIC(AVX_IsSupported, "get_IsSupported", AVX, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_Add, "Add", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_addpd}, 
HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX_Multiply, "Multiply", AVX, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_mulpd}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX_Reciprocal, "Reciprocal", AVX, -1, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX_BlendVariable, "BlendVariable", AVX, -1, 32, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_vblendvps, INS_vblendvpd}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) + +// AVX2 Intrinsics +HARDWARE_INTRINSIC(AVX2_IsSupported, "get_IsSupported", AVX2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(AVX2_Add, "Add", AVX2, -1, 32, 2, {INS_paddb, INS_paddb, INS_paddw, INS_paddw, INS_paddd, INS_paddd, INS_paddq, INS_paddq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX2_Multiply, "Multiply", AVX2, -1, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_pmuldq, INS_pmuludq, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(AVX2_BlendVariable, "BlendVariable", AVX2, -1, 32, 3, {INS_vpblendvb, INS_vpblendvb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) + +// AES Intrinsics +HARDWARE_INTRINSIC(AES_IsSupported, "get_IsSupported", AES, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// BMI1 Intrinsics +HARDWARE_INTRINSIC(BMI1_IsSupported, "get_IsSupported", BMI1, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// BMI2 Intrinsics +HARDWARE_INTRINSIC(BMI2_IsSupported, "get_IsSupported", BMI2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// FMA Intrinsics +HARDWARE_INTRINSIC(FMA_IsSupported, "get_IsSupported", FMA, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) + +// LZCNT Intrinsics +HARDWARE_INTRINSIC(LZCNT_IsSupported, "get_IsSupported", LZCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(LZCNT_LeadingZeroCount, "LeadingZeroCount", LZCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_lzcnt, INS_invalid, INS_lzcnt, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag) // PCLMULQDQ Intrinsics -HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported, "get_IsSupported", PCLMULQDQ) +HARDWARE_INTRINSIC(PCLMULQDQ_IsSupported, "get_IsSupported", PCLMULQDQ, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) // POPCNT Intrinsics -HARDWARE_INTRINSIC(POPCNT_IsSupported, "get_IsSupported", POPCNT) -HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT) -#endif // FEATURE_HW_INTRINSICS +HARDWARE_INTRINSIC(POPCNT_IsSupported, "get_IsSupported", 
POPCNT, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(POPCNT_PopCount, "PopCount", POPCNT, -1, 0, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_popcnt, INS_invalid, INS_popcnt, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Scalar, HW_Flag_NoFlag) +#endif // FEATURE_HW_INTRINSICS #undef HARDWARE_INTRINSIC diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index 5016cc09c4fd..a96952db6cb6 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -8,13 +8,20 @@ struct HWIntrinsicInfo { - NamedIntrinsic intrinsicID; - const char* intrinsicName; - InstructionSet isa; -} + NamedIntrinsic intrinsicID; + const char* intrinsicName; + InstructionSet isa; + int ival; + unsigned simdSize; + int numArgs; + instruction ins[10]; + HWIntrinsicCategory category; + HWIntrinsicFlag flag; +}; -static const hwIntrinsicInfoArray[] = { -#define HARDWARE_INTRINSIC(id, name, isa) {NI_##id, name, InstructionSet_##isa}, +static const HWIntrinsicInfo hwIntrinsicInfoArray[] = { +#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag) \ + {NI_##id, name, InstructionSet_##isa, ival, size, numarg, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, category, flag}, #include "hwintrinsiclistxarch.h" }; @@ -153,64 +160,56 @@ InstructionSet Compiler::isaOfHWIntrinsic(NamedIntrinsic intrinsic) } //------------------------------------------------------------------------ -// ivalOfHWIntrinsic: get the imm8 value of the given intrinsic +// ivalOfHWIntrinsic: get the imm8 value of this intrinsic from the hwIntrinsicInfoArray table // // Arguments: // intrinsic -- id of the intrinsic function. 
// // Return Value: -// the imm8 value of the intrinsic, -1 for non-IMM intrinsics +// The imm8 value that is implicit for this intrinsic, or -1 for intrinsics that do not take an immediate, or for +// which the immediate is an explicit argument. // int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic) { assert(intrinsic != NI_Illegal); assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ival; +} - switch (intrinsic) - { - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - return 0; - - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - return 1; - - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - return 2; - - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - return 3; - - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - return 4; - - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - return 5; - - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - return 6; - - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - return 7; +//------------------------------------------------------------------------ +// simdSizeOfHWIntrinsic: get the SIMD size of this intrinsic +// +// Arguments: +// intrinsic -- id of the intrinsic function. 
+// +// Return Value: +// the SIMD size of this intrinsic +// - from the hwIntrinsicInfoArray table if intrinsic has NO HW_Flag_UnfixedSIMDSize +// - TODO-XArch-NYI - from the signature if intrinsic has HW_Flag_UnfixedSIMDSize +// +// Note - this function is only used by the importer +// after importation (i.e., codegen), we can get the SIMD size from GenTreeHWIntrinsic IR +static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) +{ + assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; +} - default: - return -1; - } +//------------------------------------------------------------------------ +// numArgsOfHWIntrinsic: get the number of arguments +// +// Arguments: +// intrinsic -- id of the intrinsic function. +// +// Return Value: +// number of arguments +// +int Compiler::numArgsOfHWIntrinsic(NamedIntrinsic intrinsic) +{ + assert(intrinsic != NI_Illegal); + assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].numArgs; } //------------------------------------------------------------------------ @@ -228,200 +227,72 @@ instruction Compiler::insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type) { assert(intrinsic != NI_Illegal); assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + assert(type >= TYP_BYTE && type <= TYP_DOUBLE); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].ins[type - TYP_BYTE]; +} - switch (intrinsic) - { - case NI_SSE_Add: - return INS_addps; - - case NI_SSE_AddScalar: - return INS_addss; - - case NI_SSE_And: - return INS_andps; - - case NI_SSE_AndNot: - return INS_andnps; - - case NI_SSE_CompareEqual: - case NI_SSE_CompareGreaterThan: - case 
NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareUnordered: - return INS_cmpps; - - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnorderedScalar: - return INS_cmpss; - - case NI_SSE_CompareEqualOrderedScalar: - case NI_SSE_CompareGreaterThanOrderedScalar: - case NI_SSE_CompareGreaterThanOrEqualOrderedScalar: - case NI_SSE_CompareLessThanOrderedScalar: - case NI_SSE_CompareLessThanOrEqualOrderedScalar: - case NI_SSE_CompareNotEqualOrderedScalar: - return INS_comiss; - - case NI_SSE_CompareEqualUnorderedScalar: - case NI_SSE_CompareGreaterThanUnorderedScalar: - case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar: - case NI_SSE_CompareLessThanUnorderedScalar: - case NI_SSE_CompareLessThanOrEqualUnorderedScalar: - case NI_SSE_CompareNotEqualUnorderedScalar: - return INS_ucomiss; - - case NI_SSE_ConvertToInt32: - case NI_SSE_ConvertToInt64: - return INS_cvtss2si; - - case NI_SSE_ConvertToInt32WithTruncation: - case NI_SSE_ConvertToInt64WithTruncation: - return INS_cvttss2si; - - case NI_SSE_ConvertToSingle: - case NI_SSE_LoadScalar: - case NI_SSE_MoveScalar: - return INS_movss; - - case NI_SSE_ConvertToVector128SingleScalar: - return INS_cvtsi2ss; - - case NI_SSE_Divide: - return INS_divps; - - case NI_SSE_DivideScalar: - return INS_divss; - - case 
NI_SSE_LoadAlignedVector128: - case NI_SSE_StaticCast: - return INS_movaps; - - case NI_SSE_LoadHigh: - return INS_movhps; - - case NI_SSE_LoadLow: - return INS_movlps; - - case NI_SSE_LoadVector128: - return INS_movups; - - case NI_SSE_Max: - return INS_maxps; - - case NI_SSE_MaxScalar: - return INS_maxss; - - case NI_SSE_Min: - return INS_minps; - - case NI_SSE_MinScalar: - return INS_minss; - - case NI_SSE_MoveHighToLow: - return INS_movhlps; - - case NI_SSE_MoveLowToHigh: - return INS_movlhps; - - case NI_SSE_MoveMask: - return INS_movmskps; - - case NI_SSE_Multiply: - return INS_mulps; - - case NI_SSE_MultiplyScalar: - return INS_mulss; - - case NI_SSE_Or: - return INS_orps; - - case NI_SSE_Reciprocal: - return INS_rcpps; - - case NI_SSE_ReciprocalScalar: - return INS_rcpss; - - case NI_SSE_ReciprocalSqrt: - return INS_rsqrtps; - - case NI_SSE_ReciprocalSqrtScalar: - return INS_rsqrtss; - - case NI_SSE_Sqrt: - return INS_sqrtps; - - case NI_SSE_SqrtScalar: - return INS_sqrtss; - - case NI_SSE_Subtract: - return INS_subps; - - case NI_SSE_SubtractScalar: - return INS_subss; - - case NI_SSE_UnpackHigh: - return INS_unpckhps; - - case NI_SSE_UnpackLow: - return INS_unpcklps; - - case NI_SSE_Xor: - return INS_xorps; - - default: - return INS_invalid; - } +//------------------------------------------------------------------------ +// categoryOfHWIntrinsic: get the category of the given intrinsic +// +// Arguments: +// intrinsic -- id of the intrinsic function. 
+// +// Return Value: +// the category of the given intrinsic +// +HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic) +{ + assert(intrinsic != NI_Illegal); + assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].category; } //------------------------------------------------------------------------ -// isIntrinsicAnIsSupportedPropertyGetter: return true if the intrinsic is "get_IsSupported" +// flagOfHWIntrinsic: get the flag of the given intrinsic // // Arguments: // intrinsic -- id of the intrinsic function. // // Return Value: -// true if the intrinsic is "get_IsSupported" -// Sometimes we need to specially treat "get_IsSupported" -bool Compiler::isIntrinsicAnIsSupportedPropertyGetter(NamedIntrinsic intrinsic) +// the flag of the given intrinsic +// +HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic) { - switch (intrinsic) + assert(intrinsic != NI_Illegal); + assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag; +} + +//------------------------------------------------------------------------ +// getArgForHWIntrinsic: get the argument from the stack and match the signature +// +// Arguments: +// argType -- the required type of argument +// argClass -- the class handle of argType +// +// Return Value: +// the argument popped from the stack, checked against the signature type +// +GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass) +{ + GenTree* arg = nullptr; + if (argType == TYP_STRUCT) {
- case NI_FMA_IsSupported: - case NI_LZCNT_IsSupported: - case NI_PCLMULQDQ_IsSupported: - case NI_POPCNT_IsSupported: - return true; - default: - return false; + unsigned int argSizeBytes; + var_types base = getBaseTypeAndSizeOfSIMDType(argClass, &argSizeBytes); + argType = getSIMDTypeForSize(argSizeBytes); + assert(argType == TYP_SIMD32 || argType == TYP_SIMD16); + arg = impSIMDPopStack(argType); + assert(arg->TypeGet() == TYP_SIMD16 || arg->TypeGet() == TYP_SIMD32); + } + else + { + assert(varTypeIsArithmetic(argType)); + arg = impPopStack().val; + assert(varTypeIsArithmetic(arg->TypeGet())); + assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet())); } + return arg; } //------------------------------------------------------------------------ @@ -503,6 +374,15 @@ bool Compiler::compSupportsHWIntrinsic(InstructionSet isa) isFullyImplmentedISAClass(isa)); } +static bool isTypeSupportedForIntrinsic(var_types type) +{ +#ifdef _TARGET_X86_ + return !varTypeIsLong(type); +#else + return true; +#endif +} + //------------------------------------------------------------------------ // impUnsupportedHWIntrinsic: returns a node for an unsupported HWIntrinsic // @@ -547,9 +427,25 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned helper, } } +//------------------------------------------------------------------------ +// impIsTableDrivenHWIntrinsic: +// +// Arguments: +// category - category of a HW intrinsic +// +// Return Value: +// returns true if this category can be table-driven in the importer +// +static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +{ + // TODO - make more categories to the table-driven framework + const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; + const bool nonTableDrivenIntrinsic = category == HW_Category_Special; + return tableDrivenIntrinsic && !nonTableDrivenIntrinsic; +} + //------------------------------------------------------------------------ // impX86HWIntrinsic: dispatch hardware intrinsics 
to their own implementation -// function // // Arguments: // intrinsic -- id of the intrinsic function. @@ -564,22 +460,94 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, bool mustExpand) { - InstructionSet isa = isaOfHWIntrinsic(intrinsic); + InstructionSet isa = isaOfHWIntrinsic(intrinsic); + HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic); + int numArgs = sig->numArgs; + var_types callType = JITtype2varType(sig->retType); // This intrinsic is supported if // - the ISA is available on the underlying hardware (compSupports returns true) // - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true) - bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa); + // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns + // true) + bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType); - if (isIntrinsicAnIsSupportedPropertyGetter(intrinsic)) + if (category == HW_Category_IsSupportedProperty) { return gtNewIconNode(issupported); } + // - calling to unsupported intrinsics must throw PlatformNotSupportedException else if (!issupported) { return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); } + // table-driven importer of simple intrinsics + if (impIsTableDrivenHWIntrinsic(category)) + { + unsigned int sizeBytes; + var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); + assert(baseType != TYP_UNKNOWN && sizeBytes != 0); + var_types retType = getSIMDTypeForSize(sizeBytes); + unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); + CORINFO_ARG_LIST_HANDLE argList = sig->args; + CORINFO_CLASS_HANDLE argClass; + var_types argType = TYP_UNKNOWN; + + assert(numArgs > 0); + assert(retType != TYP_UNDEF); + assert(retType == TYP_SIMD16 || retType == TYP_SIMD32); +
assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid); + assert(simdSize == 32 || simdSize == 16); + + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + + switch (numArgs) + { + case 1: + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); + op1 = getArgForHWIntrinsic(argType, argClass); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + break; + case 2: + argType = JITtype2varType( + strip(info.compCompHnd->getArgType(sig, info.compCompHnd->getArgNext(argList), &argClass))); + op2 = getArgForHWIntrinsic(argType, argClass); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); + op1 = getArgForHWIntrinsic(argType, argClass); + + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, intrinsic, baseType, simdSize); + break; + + case 3: + { + CORINFO_ARG_LIST_HANDLE arg2 = info.compCompHnd->getArgNext(argList); + CORINFO_ARG_LIST_HANDLE arg3 = info.compCompHnd->getArgNext(arg2); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg3, &argClass))); + GenTree* op3 = getArgForHWIntrinsic(argType, argClass); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, arg2, &argClass))); + op2 = getArgForHWIntrinsic(argType, argClass); + + argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); + op1 = getArgForHWIntrinsic(argType, argClass); + + op1 = gtNewArgList(op1, op2, op3); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + break; + } + default: + unreached(); + } + return retNode; + } + + // other intrinsics need special importation switch (isa) { case InstructionSet_SSE: @@ -940,14 +908,6 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic intrinsic, var_types baseType = TYP_UNKNOWN; switch (intrinsic) { - case NI_SSE2_Add: - assert(sig->numArgs == 2); - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = 
impSIMDPopStack(TYP_SIMD16); - baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, NI_SSE2_Add, baseType, 16); - break; - default: JITDUMP("Not implemented hardware intrinsic"); break; @@ -989,26 +949,17 @@ GenTree* Compiler::impSSE42Intrinsic(NamedIntrinsic intrinsic, GenTree* op2 = nullptr; var_types callType = JITtype2varType(sig->retType); - CORINFO_ARG_LIST_HANDLE argLst = sig->args; + CORINFO_ARG_LIST_HANDLE argList = sig->args; CORINFO_CLASS_HANDLE argClass; CorInfoType corType; switch (intrinsic) { case NI_SSE42_Crc32: assert(sig->numArgs == 2); - -#ifdef _TARGET_X86_ - if (varTypeIsLong(callType)) - { - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } -#endif - - op2 = impPopStack().val; - op1 = impPopStack().val; - - argLst = info.compCompHnd->getArgNext(argLst); // the second argument - corType = strip(info.compCompHnd->getArgType(sig, argLst, &argClass)); // type of the second argument + op2 = impPopStack().val; + op1 = impPopStack().val; + argList = info.compCompHnd->getArgNext(argList); // the second argument + corType = strip(info.compCompHnd->getArgType(sig, argList, &argClass)); // type of the second argument retNode = gtNewScalarHWIntrinsicNode(callType, op1, op2, NI_SSE42_Crc32); @@ -1035,14 +986,6 @@ GenTree* Compiler::impAVXIntrinsic(NamedIntrinsic intrinsic, var_types baseType = TYP_UNKNOWN; switch (intrinsic) { - case NI_AVX_Add: - assert(sig->numArgs == 2); - op2 = impSIMDPopStack(TYP_SIMD32); - op1 = impSIMDPopStack(TYP_SIMD32); - baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX_Add, baseType, 32); - break; - default: JITDUMP("Not implemented hardware intrinsic"); break; @@ -1061,14 +1004,6 @@ GenTree* Compiler::impAVX2Intrinsic(NamedIntrinsic intrinsic, var_types baseType = TYP_UNKNOWN; switch (intrinsic) { - case NI_AVX2_Add: - assert(sig->numArgs 
== 2); - op2 = impSIMDPopStack(TYP_SIMD32); - op1 = impSIMDPopStack(TYP_SIMD32); - baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD32, op1, op2, NI_AVX2_Add, baseType, 32); - break; - default: JITDUMP("Not implemented hardware intrinsic"); break; @@ -1115,14 +1050,6 @@ GenTree* Compiler::impLZCNTIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); var_types callType = JITtype2varType(sig->retType); - -#ifdef _TARGET_X86_ - if (varTypeIsLong(callType)) - { - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } -#endif - return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_LZCNT_LeadingZeroCount); } @@ -1141,14 +1068,6 @@ GenTree* Compiler::impPOPCNTIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 1); var_types callType = JITtype2varType(sig->retType); - -#ifdef _TARGET_X86_ - if (varTypeIsLong(callType)) - { - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } -#endif - return gtNewScalarHWIntrinsicNode(callType, impPopStack().val, NI_POPCNT_PopCount); } diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index afb84a52079d..f48a6ce6ef4b 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -252,6 +252,7 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55)) / INST3( orps, "orps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56)) // Or packed singles INST3( orpd, "orpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56)) // Or packed doubles INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C)) // Horizontal add packed doubles +INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocals of Packed Singles // SSE 2 approx arith INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocal of packed singles @@ -381,6 +382,11 @@ INST3( roundps, "roundps" , 
0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( roundss, "roundss" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0A)) // Round scalar single precision floating-point values INST3( roundpd, "roundpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x09)) // Round packed double precision floating-point values INST3( roundsd, "roundsd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x0B)) // Round scalar double precision floating-point values +INST3( pmuldq, "pmuldq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x28)) // packed multiply 32-bit signed integers and store 64-bit result +INST3( blendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x14)) // Variable Blend Packed Singles +INST3( blendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x15)) // Variable Blend Packed Doubles +INST3( pblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE38(0x10)) // Variable Blend Packed Bytes + INST3(LAST_SSE4_INSTRUCTION, "LAST_SSE4_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) INST3(FIRST_AVX_INSTRUCTION, "FIRST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) @@ -398,6 +404,9 @@ INST3( vinserti128, "inserti128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SS INST3( vzeroupper, "zeroupper" , 0, IUM_WR, 0, 0, 0xC577F8, BAD_CODE, BAD_CODE) // Zero upper 128-bits of all YMM regs (includes 2-byte fixed VEX prefix) INST3( vperm2i128, "perm2i128" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x46)) // Permute 128-bit halves of input register INST3( vpermq, "permq" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x00)) // Permute 64-bit of input register +INST3( vblendvps, "blendvps" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4A)) // Variable Blend Packed Singles +INST3( vblendvpd, "blendvpd" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4B)) // Variable Blend Packed Doubles +INST3( vpblendvb, "pblendvb" , 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, SSE3A(0x4C)) // Variable Blend Packed Bytes INST3(LAST_AVX_INSTRUCTION, 
"LAST_AVX_INSTRUCTION", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, BAD_CODE) // Scalar instructions in SSE4.2 diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index a6f807348d1e..4169b4a4106e 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2304,96 +2304,36 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) // void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { - NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; - GenTree* op1 = node->gtGetOp1(); - GenTree* op2 = node->gtGetOp2(); - - switch (node->gtHWIntrinsicId) - { - case NI_SSE_Add: - case NI_SSE_AddScalar: - case NI_SSE_And: - case NI_SSE_AndNot: - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - case NI_SSE_ConvertToVector128SingleScalar: - case NI_SSE_Divide: - case NI_SSE_DivideScalar: - case NI_SSE_Max: - case NI_SSE_MaxScalar: - case NI_SSE_Min: - case NI_SSE_MinScalar: - case NI_SSE_Multiply: - case NI_SSE_MultiplyScalar: - case NI_SSE_Or: - case NI_SSE_Subtract: - case NI_SSE_SubtractScalar: - case NI_SSE_UnpackHigh: - case NI_SSE_UnpackLow: - case NI_SSE_Xor: - case NI_SSE2_Add: - if 
(!comp->getEmitter()->UseVEXEncoding()) - { - // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained - // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned - break; - } - __fallthrough; + NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; + HWIntrinsicCategory category = comp->categoryOfHWIntrinsic(intrinsicID); + int numArgs = comp->numArgsOfHWIntrinsic(intrinsicID); + GenTree* op1 = node->gtGetOp1(); + GenTree* op2 = node->gtGetOp2(); - case NI_AVX_Add: - case NI_AVX2_Add: + // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained + // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned + if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding()) + { + if (IsContainableMemoryOp(op2)) { - assert(comp->getEmitter()->UseVEXEncoding()); - - if (IsContainableMemoryOp(op2)) - { - MakeSrcContained(node, op2); - } - else - { - // TODO-XArch-CQ: Commutative operations can have op1 be contained - op2->SetRegOptional(); - } - break; + MakeSrcContained(node, op2); } - - case NI_SSE_Shuffle: + else { - assert(op1->OperIsList()); - GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current(); - - if (op3->IsCnsIntOrI()) - { - MakeSrcContained(node, op3); - } - break; + // TODO-XArch-CQ: Commutative operations can have op1 be contained + op2->SetRegOptional(); } + } - default: - assert((intrinsicID > NI_HW_INTRINSIC_START) && (intrinsicID < NI_HW_INTRINSIC_END)); - break; + if (intrinsicID == NI_SSE_Shuffle) + { + assert(op1->OperIsList()); + GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current(); + + if (op3->IsCnsIntOrI()) + { + MakeSrcContained(node, op3); + } } } #endif // FEATURE_HW_INTRINSICS diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index 5e8924fe3c56..f56a36a7d077 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2504,13 +2504,11 @@ void LinearScan::TreeNodeInfoInitSIMD(GenTreeSIMD* simdTree, TreeNodeInfo* info) void
LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, TreeNodeInfo* info) { NamedIntrinsic intrinsicID = intrinsicTree->gtHWIntrinsicId; - InstructionSet isa = compiler->isaOfHWIntrinsic(intrinsicID); - + InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); if (isa == InstructionSet_AVX || isa == InstructionSet_AVX2) { SetContainsAVXFlags(true, 32); } - GenTree* op1 = intrinsicTree->gtOp.gtOp1; GenTree* op2 = intrinsicTree->gtOp.gtOp2; info->srcCount = 0; @@ -2519,15 +2517,10 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, { if (op1->OperIsList()) { - int srcCount = 0; - for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest()) { - GenTree* listItem = list->Current(); - srcCount += GetOperandInfo(listItem); + info->srcCount += GetOperandInfo(list->Current()); } - - info->srcCount += srcCount; } else { @@ -2583,6 +2576,21 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, useList.Last()->info.isTgtPref = true; break; + case NI_SSE41_BlendVariable: + { + if (!compiler->canUseVexEncoding()) + { + // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 + LocationInfoListNode* op2Info = useList.Begin()->Next(); + LocationInfoListNode* op3Info = op2Info->Next(); + op2Info->info.isDelayFree = true; + op3Info->info.isDelayFree = true; + op3Info->info.setSrcCandidates(this, RBM_XMM0); + info->hasDelayFreeSrc = true; + } + break; + } + #ifdef _TARGET_X86_ case NI_SSE42_Crc32: { diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 1144df6ad262..8d5aac28f3a2 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -16,10 +16,83 @@ enum NamedIntrinsic : unsigned int NI_System_Collections_Generic_EqualityComparer_get_Default = 4, #if FEATURE_HW_INTRINSICS NI_HW_INTRINSIC_START, -#define HARDWARE_INTRINSIC(id, name, isa) NI_##id, +#define HARDWARE_INTRINSIC(id, name, isa, ival, size, numarg, t1, t2, t3, t4, 
t5, t6, t7, t8, t9, t10, category, flag) \ + NI_##id, #include "hwintrinsiclistxarch.h" NI_HW_INTRINSIC_END #endif }; +#if FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_) +enum HWIntrinsicFlag : unsigned int +{ + HW_Flag_NoFlag = 0, + + // Commutative + // - if a binary-op intrinsic is commutative (e.g., Add, Multiply), its op1 can be contained + HW_Flag_Commutative = 0x1, + + // Full range IMM intrinsic + // - the immediate value is valid on the full range of imm8 (0-255) + HW_Flag_FullRangeIMM = 0x2, + + // Generic + // - must throw NotSupportedException if the type argument is not numeric type + HW_Flag_Generic = 0x4, + + // NoCodeGen + // - should be transformed in the compiler front-end, cannot reach CodeGen + HW_Flag_NoCodeGen = 0x8, + + // Unfixed SIMD-size + // - overloaded on multiple vector sizes (SIMD size in the table is unreliable) + HW_Flag_UnfixedSIMDSize = 0x10, + + // Complex overload + // - the codegen of overloads cannot be determined by intrinsicID and base type + HW_Flag_ComplexOverloads = 0x20, +}; + +inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2) +{ + return static_cast<HWIntrinsicFlag>(static_cast<unsigned int>(c1) | static_cast<unsigned int>(c2)); +} + +enum HWIntrinsicCategory : unsigned int +{ + // Simple SIMD intrinsics + // - take Vector128/256 parameters + // - return a Vector128/256 + // - generate single instruction + // - the codegen of overloads can be determined by intrinsicID and base type of returned vector + HW_Category_SimpleSIMD, + + // IsSupported Property + // - each ISA class has an "IsSupported" property + HW_Category_IsSupportedProperty, + + // IMM intrinsics + // - some SIMD intrinsics require immediate value (i.e. imm8) to generate instruction + HW_Category_IMM, + + // Scalar intrinsics + // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc.
+ HW_Category_Scalar, + + // Memory access intrinsics + // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned + HW_Category_MemoryLoad, + HW_Category_MemoryStore, + + // Helper intrinsics + // - do not directly correspond to a instruction, such as Avx.SetAllVector256 + HW_Category_Helper, + + // Special intrinsics + // - have to be addressed specially + HW_Category_Special +}; + +#endif // FEATURE_HW_INTRINSICS && defined(_TARGET_XARCH_) + #endif // _NAMEDINTRINSICLIST_H_ diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs new file mode 100644 index 000000000000..2c7bf210ba3c --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply.cs @@ -0,0 +1,108 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Avx.IsSupported) + { + using (TestTable floatTable = new TestTable(new float[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new float[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new float[8])) + using (TestTable doubleTable = new TestTable(new double[4] { 1, -5, 100, 0 }, new double[4] { 22, -1, -50, 0 }, new double[4])) + { + var vf1 = Unsafe.Read>(floatTable.inArray1Ptr); + var vf2 = Unsafe.Read>(floatTable.inArray2Ptr); + var vf3 = Avx.Multiply(vf1, vf2); + Unsafe.Write(floatTable.outArrayPtr, vf3); + + var vd1 = Unsafe.Read>(doubleTable.inArray1Ptr); + var vd2 = Unsafe.Read>(doubleTable.inArray2Ptr); + var vd3 = Avx.Multiply(vd1, vd2); + Unsafe.Write(doubleTable.outArrayPtr, vd3); + + if 
(!floatTable.CheckResult((x, y, z) => x * y == z)) + { + Console.WriteLine("AVX Multiply failed on float:"); + foreach (var item in floatTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + + if (!doubleTable.CheckResult((x, y, z) => x * y == z)) + { + Console.WriteLine("AVX Multiply failed on double:"); + foreach (var item in doubleTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + testResult = Fail; + } + } + } + return testResult; + } + + public unsafe struct TestTable : IDisposable where T1 : struct where T2 : struct where T3 : struct + { + public T1[] inArray1; + public T2[] inArray2; + public T3[] outArray; + + public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer(); + public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle1; + GCHandle inHandle2; + GCHandle outHandle; + public TestTable(T1[] a, T2[] b, T3[] c) + { + this.inArray1 = a; + this.inArray2 = b; + this.outArray = c; + + inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned); + inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func check) + { + for (int i = 0; i < inArray1.Length; i++) + { + if (!check(inArray1[i], inArray2[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle1.Free(); + inHandle2.Free(); + outHandle.Free(); + } + } + + } +} \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj new file mode 100644 index 000000000000..7c151fec7959 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + 
{786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj new file mode 100644 index 000000000000..b6fbea2236af --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx/Multiply_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + true + + + + + + + + + + \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs new file mode 100644 index 000000000000..4f3b6afb4500 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply.cs @@ -0,0 +1,116 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
+// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Avx2.IsSupported) + { + using (TestTable intTable = new TestTable(new int[8] { 1, -5, 100, 0, 1, -5, 100, 0 }, new int[8] { 22, -1, -50, 0, 22, -1, -50, 0 }, new long[4])) + using (TestTable uintTable = new TestTable(new uint[8] { 1, 5, 100, 0, 1, 5, 100, 0 }, new uint[8] { 22, 1, 50, 0, 22, 1, 50, 0 }, new ulong[4])) + { + + var vi1 = Unsafe.Read>(intTable.inArray1Ptr); + var vi2 = Unsafe.Read>(intTable.inArray2Ptr); + var vi3 = Avx2.Multiply(vi1, vi2); + Unsafe.Write(intTable.outArrayPtr, vi3); + + var vui1 = Unsafe.Read>(uintTable.inArray1Ptr); + var vui2 = Unsafe.Read>(uintTable.inArray2Ptr); + var vui3 = Avx2.Multiply(vui1, vui2); + Unsafe.Write(uintTable.outArrayPtr, vui3); + + for (int i = 0; i < intTable.outArray.Length; i++) + { + if (intTable.inArray1[i * 2] * intTable.inArray2[i * 2] != intTable.outArray[i]) + { + Console.WriteLine("AVX2 Multiply failed on int:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + return Fail; + } + } + + for (int i = 0; i < uintTable.outArray.Length; i++) + { + if (uintTable.inArray1[i * 2] * uintTable.inArray2[i * 2] != uintTable.outArray[i]) + { + Console.WriteLine("AVX2 Multiply failed on uint:"); + foreach (var item in uintTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + return Fail; + } + } + } + } + + return testResult; + } + + public unsafe struct TestTable : IDisposable where T1 : struct where T2 : struct where T3 : struct + { + public T1[] inArray1; + public T2[] inArray2; + public T3[] outArray; + + public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer(); + 
public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle1; + GCHandle inHandle2; + GCHandle outHandle; + public TestTable(T1[] a, T2[] b, T3[] c) + { + this.inArray1 = a; + this.inArray2 = b; + this.outArray = c; + + inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned); + inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func check) + { + for (int i = 0; i < inArray1.Length; i++) + { + if (!check(inArray1[i], inArray2[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle1.Free(); + inHandle2.Free(); + outHandle.Free(); + } + } + + } +} \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj new file mode 100644 index 000000000000..7c151fec7959 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj new file mode 100644 index 000000000000..b6fbea2236af --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Avx2/Multiply_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + true + + + + + + + + + + \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs 
b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs new file mode 100644 index 000000000000..09feca9a4e8a --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply.cs @@ -0,0 +1,95 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. +// + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics.X86; +using System.Runtime.Intrinsics; + +namespace IntelHardwareIntrinsicTest +{ + class Program + { + const int Pass = 100; + const int Fail = 0; + + static unsafe int Main(string[] args) + { + int testResult = Pass; + + if (Sse41.IsSupported) + { + using (TestTable intTable = new TestTable(new int[4] { 1, -5, 100, 0}, new int[4] { 22, -1, -50, 0}, new long[2])) + { + + var vi1 = Unsafe.Read>(intTable.inArray1Ptr); + var vi2 = Unsafe.Read>(intTable.inArray2Ptr); + var vi3 = Sse41.Multiply(vi1, vi2); + Unsafe.Write(intTable.outArrayPtr, vi3); + + for (int i = 0; i < intTable.outArray.Length; i++) + { + if (intTable.inArray1[i * 2] * intTable.inArray2[i * 2] != intTable.outArray[i]) + { + Console.WriteLine("SSE4.1 Multiply failed on int:"); + foreach (var item in intTable.outArray) + { + Console.Write(item + ", "); + } + Console.WriteLine(); + return Fail; + } + } + } + } + return testResult; + } + + public unsafe struct TestTable : IDisposable where T1 : struct where T2 : struct where T3 : struct + { + public T1[] inArray1; + public T2[] inArray2; + public T3[] outArray; + + public void* inArray1Ptr => inHandle1.AddrOfPinnedObject().ToPointer(); + public void* inArray2Ptr => inHandle2.AddrOfPinnedObject().ToPointer(); + public void* outArrayPtr => outHandle.AddrOfPinnedObject().ToPointer(); + + GCHandle inHandle1; + GCHandle inHandle2; + GCHandle outHandle; + public TestTable(T1[] a, T2[] b, T3[] c) + { + this.inArray1 = a; + 
this.inArray2 = b; + this.outArray = c; + + inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned); + inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned); + outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned); + } + public bool CheckResult(Func check) + { + for (int i = 0; i < inArray1.Length; i++) + { + if (!check(inArray1[i], inArray2[i], outArray[i])) + { + return false; + } + } + return true; + } + + public void Dispose() + { + inHandle1.Free(); + inHandle2.Free(); + outHandle.Free(); + } + } + + } +} \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj new file mode 100644 index 000000000000..7c151fec7959 --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_r.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + + + + + + + + + + + \ No newline at end of file diff --git a/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj new file mode 100644 index 000000000000..b6fbea2236af --- /dev/null +++ b/tests/src/JIT/HardwareIntrinsics/X86/Sse41/Multiply_ro.csproj @@ -0,0 +1,34 @@ + + + + + Debug + AnyCPU + 2.0 + {95DFC527-4DC1-495E-97D7-E94EE1F7140D} + Exe + {786C830F-07A1-408B-BD7F-6EE04809D6DB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + ..\..\ + true + + + + + + + False + + + + None + true + + + + + + + + + + \ No newline at end of file From 654a8d5ff0cf1a7e0968c68a4b84aecf03ed9c1c Mon Sep 17 00:00:00 2001 From: Fei Peng Date: Fri, 19 Jan 2018 00:01:21 -0800 Subject: [PATCH 2/2] Merge SSE intrinsics into the table-driven framework --- src/jit/compiler.h | 2 +- src/jit/emitxarch.cpp | 44 +++-- src/jit/emitxarch.h | 19 +- src/jit/hwintrinsiccodegenxarch.cpp | 216 
++++++-------------- src/jit/hwintrinsiclistxarch.h | 132 ++++++------- src/jit/hwintrinsicxarch.cpp | 292 ++++++++-------------------- src/jit/instrsxarch.h | 1 - src/jit/lowerxarch.cpp | 12 +- src/jit/lsraxarch.cpp | 3 - src/jit/namedintrinsiclist.h | 26 ++- 10 files changed, 267 insertions(+), 480 deletions(-) diff --git a/src/jit/compiler.h b/src/jit/compiler.h index ea1c6b2ac443..f48a7b796561 100644 --- a/src/jit/compiler.h +++ b/src/jit/compiler.h @@ -3122,7 +3122,7 @@ class Compiler static int numArgsOfHWIntrinsic(NamedIntrinsic intrinsic); static instruction insOfHWIntrinsic(NamedIntrinsic intrinsic, var_types type); static HWIntrinsicCategory categoryOfHWIntrinsic(NamedIntrinsic intrinsic); - static HWIntrinsicFlag flagOfHWIntrinsic(NamedIntrinsic intrinsic); + static HWIntrinsicFlag flagsOfHWIntrinsic(NamedIntrinsic intrinsic); GenTree* getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE argClass); GenTreeArgList* buildArgList(CORINFO_SIG_INFO* sig); #endif // _TARGET_XARCH_ diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp index 0f6a9ff059b8..73f28d94c469 100644 --- a/src/jit/emitxarch.cpp +++ b/src/jit/emitxarch.cpp @@ -4228,6 +4228,7 @@ void emitter::emitIns_R_R_S_I( emitCurIGsize += sz; } +#ifdef DEBUG static bool isAvxBlendv(instruction ins) { return ins == INS_vblendvps || ins == INS_vblendvpd || ins == INS_vpblendvb; @@ -4237,6 +4238,7 @@ static bool isSse41Blendv(instruction ins) { return ins == INS_blendvps || ins == INS_blendvpd || ins == INS_pblendvb; } +#endif void emitter::emitIns_R_R_R_R( instruction ins, emitAttr attr, regNumber targetReg, regNumber reg1, regNumber reg2, regNumber reg3) @@ -5216,23 +5218,23 @@ void emitter::emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, { emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_A(ins, emitTypeSize(simdtype), reg, indir, IF_RRW_ARD); + emitIns_R_A(ins, attr, reg, indir, IF_RRW_ARD); } } -void emitter::emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, 
regNumber reg1, regNumber base, var_types simdtype) +void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base) { if (UseVEXEncoding()) { - emitIns_R_R_AR(ins, emitTypeSize(simdtype), reg, reg1, base, 0); + emitIns_R_R_AR(ins, attr, reg, reg1, base, 0); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_AR(ins, emitTypeSize(simdtype), reg, base, 0); + emitIns_R_AR(ins, attr, reg, base, 0); } } @@ -5325,70 +5327,70 @@ void emitter::emitIns_SIMD_R_R_S(instruction ins, emitAttr attr, regNumber reg, } void emitter::emitIns_SIMD_R_R_A_I( - instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_A_I(ins, emitTypeSize(simdtype), reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS); + emitIns_R_R_A_I(ins, attr, reg, reg1, indir, ival, IF_RWR_RRD_ARD_CNS); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_A_I(ins, emitTypeSize(simdtype), reg, indir, ival); + emitIns_R_A_I(ins, attr, reg, indir, ival); } } void emitter::emitIns_SIMD_R_R_C_I( - instruction ins, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_C_I(ins, emitTypeSize(simdtype), reg, reg1, fldHnd, offs, ival); + emitIns_R_R_C_I(ins, attr, reg, reg1, fldHnd, offs, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_C_I(ins, emitTypeSize(simdtype), reg, fldHnd, offs, ival); + emitIns_R_C_I(ins, attr, 
reg, fldHnd, offs, ival); } } void emitter::emitIns_SIMD_R_R_R_I( - instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_R_I(ins, emitTypeSize(simdtype), reg, reg1, reg2, ival); + emitIns_R_R_R_I(ins, attr, reg, reg1, reg2, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_R_I(ins, emitTypeSize(simdtype), reg, reg2, ival); + emitIns_R_R_I(ins, attr, reg, reg2, ival); } } void emitter::emitIns_SIMD_R_R_S_I( - instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype) + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival) { if (UseVEXEncoding()) { - emitIns_R_R_S_I(ins, emitTypeSize(simdtype), reg, reg1, varx, offs, ival); + emitIns_R_R_S_I(ins, attr, reg, reg1, varx, offs, ival); } else { if (reg1 != reg) { - emitIns_R_R(INS_movaps, emitTypeSize(simdtype), reg, reg1); + emitIns_R_R(INS_movaps, attr, reg, reg1); } - emitIns_R_S_I(ins, emitTypeSize(simdtype), reg, varx, offs, ival); + emitIns_R_S_I(ins, attr, reg, varx, offs, ival); } } #endif diff --git a/src/jit/emitxarch.h b/src/jit/emitxarch.h index 047344787116..f2e426d07a7c 100644 --- a/src/jit/emitxarch.h +++ b/src/jit/emitxarch.h @@ -455,19 +455,12 @@ void emitIns_R_AX(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg, unsigned mul, int disp); #if FEATURE_HW_INTRINSICS -void emitIns_SIMD_R_R_AR(instruction ins, regNumber reg, regNumber reg1, regNumber base, var_types simdtype); -void emitIns_SIMD_R_R_A_I( - instruction ins, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival, var_types simdtype); -void emitIns_SIMD_R_R_C_I(instruction ins, - regNumber reg, - 
regNumber reg1, - CORINFO_FIELD_HANDLE fldHnd, - int offs, - int ival, - var_types simdtype); -void emitIns_SIMD_R_R_R_I(instruction ins, regNumber reg, regNumber reg1, regNumber reg2, int ival, var_types simdtype); -void emitIns_SIMD_R_R_S_I( - instruction ins, regNumber reg, regNumber reg1, int varx, int offs, int ival, var_types simdtype); +void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber base); +void emitIns_SIMD_R_R_A_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir, int ival); +void emitIns_SIMD_R_R_C_I( + instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs, int ival); +void emitIns_SIMD_R_R_R_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, regNumber reg2, int ival); +void emitIns_SIMD_R_R_S_I(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, int varx, int offs, int ival); void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber reg, regNumber reg1, GenTreeIndir* indir); void emitIns_SIMD_R_R_C( instruction ins, emitAttr attr, regNumber reg, regNumber reg1, CORINFO_FIELD_HANDLE fldHnd, int offs); diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp index 6ea0de7e2332..69b3cf54baeb 100644 --- a/src/jit/hwintrinsiccodegenxarch.cpp +++ b/src/jit/hwintrinsiccodegenxarch.cpp @@ -33,12 +33,14 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX // Return Value: // returns true if this category can be table-driven in CodeGen // -static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +static bool genIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags) { // TODO - make more categories to the table-driven framework - const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; - const bool nonTableDrivenIntrinsic = category == HW_Category_Special; - return tableDrivenIntrinsic && 
!nonTableDrivenIntrinsic; + // HW_Category_Helper and HW_Flag_MultiIns usually need manual codegen + const bool tableDrivenCategory = + category == HW_Category_SimpleSIMD || category == HW_Category_MemoryLoad || category == HW_Category_SIMDScalar; + const bool tableDrivenFlag = (flags & HW_Flag_MultiIns) == 0; + return tableDrivenCategory && tableDrivenFlag; } void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) @@ -46,12 +48,13 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; InstructionSet isa = Compiler::isaOfHWIntrinsic(intrinsicID); HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); - HWIntrinsicFlag flag = Compiler::flagOfHWIntrinsic(intrinsicID); + HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID); + int ival = Compiler::ivalOfHWIntrinsic(intrinsicID); int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); - assert((flag & HW_Flag_NoCodeGen) == 0); + assert((flags & HW_Flag_NoCodeGen) == 0); - if (genIsTableDrivenHWIntrinsic(category)) + if (genIsTableDrivenHWIntrinsic(category, flags)) { GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); @@ -74,10 +77,35 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node) case 1: genConsumeOperands(node); op1Reg = op1->gtRegNum; - emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); + if (category == HW_Category_MemoryLoad) + { + emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0); + } + else if (category == HW_Category_SIMDScalar && (flags & HW_Flag_CopyUpperBits) != 0) + { + emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg); + } + else + { + + emit->emitIns_R_R(ins, simdSize, targetReg, op1Reg); + } break; + case 2: - genHWIntrinsic_R_R_RM(node, ins); + genConsumeOperands(node); + if (ival != -1) + { + genHWIntrinsic_R_R_RM_I(node, ins); + } + else if (category == HW_Category_MemoryLoad) + { + emit->emitIns_SIMD_R_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, 
op1->gtRegNum, op2->gtRegNum); + } + else + { + genHWIntrinsic_R_R_RM(node, ins); + } break; case 3: { @@ -329,14 +357,14 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins) case GT_CLS_VAR_ADDR: { - emit->emitIns_SIMD_R_R_C_I(ins, targetReg, op1Reg, memBase->gtClsVar.gtClsVarHnd, 0, ival, - targetType); + emit->emitIns_SIMD_R_R_C_I(ins, emitTypeSize(targetType), targetReg, op1Reg, + memBase->gtClsVar.gtClsVarHnd, 0, ival); return; } default: { - emit->emitIns_SIMD_R_R_A_I(ins, targetReg, op1Reg, memIndir, ival, targetType); + emit->emitIns_SIMD_R_R_A_I(ins, emitTypeSize(targetType), targetReg, op1Reg, memIndir, ival); return; } } @@ -374,11 +402,11 @@ void CodeGen::genHWIntrinsic_R_R_RM_I(GenTreeHWIntrinsic* node, instruction ins) assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr)); assert(offset != (unsigned)-1); - emit->emitIns_SIMD_R_R_S_I(ins, targetReg, op1Reg, varNum, offset, ival, targetType); + emit->emitIns_SIMD_R_R_S_I(ins, emitTypeSize(targetType), targetReg, op1Reg, varNum, offset, ival); } else { - emit->emitIns_SIMD_R_R_R_I(ins, targetReg, op1Reg, op2->gtRegNum, ival, targetType); + emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(targetType), targetReg, op1Reg, op2->gtRegNum, ival); } } @@ -392,7 +420,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) regNumber targetReg = node->gtRegNum; var_types targetType = node->TypeGet(); var_types baseType = node->gtSIMDBaseType; - instruction ins = INS_invalid; + instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); regNumber op1Reg = REG_NA; regNumber op2Reg = REG_NA; @@ -408,28 +436,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) switch (intrinsicID) { - case NI_SSE_Add: - case NI_SSE_AddScalar: - case NI_SSE_And: - case NI_SSE_AndNot: case NI_SSE_ConvertToVector128SingleScalar: - case NI_SSE_Divide: - case NI_SSE_DivideScalar: - case NI_SSE_Max: - case NI_SSE_MaxScalar: - case NI_SSE_Min: - case NI_SSE_MinScalar: - case 
NI_SSE_MoveHighToLow: - case NI_SSE_MoveLowToHigh: - case NI_SSE_MoveScalar: - case NI_SSE_Multiply: - case NI_SSE_MultiplyScalar: - case NI_SSE_Or: - case NI_SSE_Subtract: - case NI_SSE_SubtractScalar: - case NI_SSE_UnpackHigh: - case NI_SSE_UnpackLow: - case NI_SSE_Xor: { assert(node->TypeGet() == TYP_SIMD16); assert(node->gtSIMDBaseType == TYP_FLOAT); @@ -439,51 +446,14 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) genHWIntrinsic_R_R_RM(node, ins); break; } - - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - { - assert(node->TypeGet() == TYP_SIMD16); - assert(node->gtSIMDBaseType == TYP_FLOAT); - assert(Compiler::ivalOfHWIntrinsic(intrinsicID) != -1); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - genHWIntrinsic_R_R_RM_I(node, ins); - break; - } - case NI_SSE_CompareEqualOrderedScalar: case NI_SSE_CompareEqualUnorderedScalar: { assert(baseType == TYP_FLOAT); - op2Reg = op2->gtRegNum; - - regNumber tmpReg = node->GetSingleTempReg(); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); + op2Reg = op2->gtRegNum; + regNumber tmpReg = 
node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg); emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg); emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg); @@ -498,8 +468,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -511,8 +480,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -524,8 +492,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg); emit->emitIns_R(INS_seta, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -537,8 +504,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, op2Reg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, 
emitTypeSize(TYP_SIMD16), op2Reg, op1Reg); emit->emitIns_R(INS_setae, EA_1BYTE, targetReg); emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg); break; @@ -550,10 +516,9 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); op2Reg = op2->gtRegNum; - regNumber tmpReg = node->GetSingleTempReg(); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); + regNumber tmpReg = node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(ins, op1Reg, op2Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg); emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg); emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg); emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg); @@ -562,84 +527,23 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) break; } - case NI_SSE_ConvertToInt32: - case NI_SSE_ConvertToInt32WithTruncation: - case NI_SSE_ConvertToInt64: - case NI_SSE_ConvertToInt64WithTruncation: - case NI_SSE_Reciprocal: - case NI_SSE_ReciprocalSqrt: - case NI_SSE_Sqrt: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16); - break; - } - case NI_SSE_ConvertToSingle: case NI_SSE_StaticCast: { assert(op2 == nullptr); if (op1Reg != targetReg) { - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg); } break; } - case NI_SSE_LoadAlignedVector128: - case NI_SSE_LoadScalar: - case NI_SSE_LoadVector128: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_R_AR(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, 0); - break; - } - - case NI_SSE_LoadHigh: - case NI_SSE_LoadLow: - { - 
assert(baseType == TYP_FLOAT); - op2Reg = op2->gtRegNum; - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R_AR(ins, targetReg, op1Reg, op2Reg, TYP_SIMD16); - break; - } - case NI_SSE_MoveMask: { assert(baseType == TYP_FLOAT); assert(op2 == nullptr); - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R(ins, targetReg, op1Reg, TYP_INT); - break; - } - - case NI_SSE_ReciprocalScalar: - case NI_SSE_ReciprocalSqrtScalar: - case NI_SSE_SqrtScalar: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - - instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, node->gtSIMDBaseType); - emit->emitIns_SIMD_R_R_R(ins, targetReg, op1Reg, op1Reg, TYP_SIMD16); - break; - } - - case NI_SSE_SetAllVector128: - { - assert(baseType == TYP_FLOAT); - assert(op2 == nullptr); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op1Reg, 0, TYP_SIMD16); + emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg); break; } @@ -651,12 +555,12 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) if (op1Reg == targetReg) { regNumber tmpReg = node->GetSingleTempReg(); - emit->emitIns_SIMD_R_R(INS_movaps, tmpReg, op1Reg, TYP_SIMD16); + emit->emitIns_R_R(INS_movaps, emitTypeSize(TYP_SIMD16), tmpReg, op1Reg); op1Reg = tmpReg; } - emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16); - emit->emitIns_SIMD_R_R_R(INS_movss, targetReg, targetReg, op1Reg, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg); + emit->emitIns_SIMD_R_R_R(INS_movss, emitTypeSize(TYP_SIMD16), targetReg, targetReg, op1Reg); break; } @@ -665,7 +569,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) assert(baseType == TYP_FLOAT); assert(op1 == nullptr); assert(op2 == nullptr); - emit->emitIns_SIMD_R_R_R(INS_xorps, targetReg, targetReg, targetReg, TYP_SIMD16); + 
emit->emitIns_SIMD_R_R_R(INS_xorps, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg); break; } @@ -699,7 +603,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) if (op3->IsCnsIntOrI()) { ssize_t ival = op3->AsIntConCommon()->IconValue(); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, (int)ival, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, (int)ival); } else { @@ -749,7 +653,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node) for (unsigned i = 0; i < jmpCount; i++) { genDefineTempLabel(jmpTable[i]); - emit->emitIns_SIMD_R_R_R_I(INS_shufps, targetReg, op1Reg, op2Reg, i, TYP_SIMD16); + emit->emitIns_SIMD_R_R_R_I(INS_shufps, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, i); emit->emitIns_J(INS_jmp, switchTableEnd); } diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h index f9ccf7fd8c27..ec9c9d70b054 100644 --- a/src/jit/hwintrinsiclistxarch.h +++ b/src/jit/hwintrinsiclistxarch.h @@ -15,7 +15,7 @@ 1) Each hardware intrinsic has a unique Intrinsic ID with type of `enum NamedIntrinsic` 2) All the overloads of an intrinsic in an ISA class share one Intrinsic ID 3) The intrinsic that generates instructions with a fixed imm8 operand has a `ival` field with "not -1" value, e.g., Sse.CompareEqual(v1,v2) -> cmpps xmm0, xmm1, 0 - 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128` (16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_) or `Vector256` + 4) SIMD intrinsics have a non-zero `SIMD size` field based-on that operate over `Vector128`(16) or `Vector256`(32) 5) Scalar intrinsics that operate over general purpose registers (e.g., Sse41.Crc32) have `SIMD size` with 0 6) Each intrinsic has a `NumArg` for number of parameters, and some intrinsics that are overloaded on multiple 
parameter numbers have this field with -1 7) Each intrinsic has 10 `instructions` fields that list the instructions should be generated based-on the base type @@ -29,92 +29,86 @@ // SSE Intrinsics HARDWARE_INTRINSIC(SSE_IsSupported, "get_IsSupported", SSE, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Add, "Add", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_AddScalar, "AddScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_addss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE_And, "And", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) -HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Fixed) -HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_AndNot, "AndNot", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_andnps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareEqual, "CompareEqual", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareEqualOrderedScalar, "CompareEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareEqualScalar, "CompareEqualScalar", SSE, 0, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative|HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_CompareEqualUnorderedScalar, "CompareEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) 
HARDWARE_INTRINSIC(SSE_CompareGreaterThan, "CompareGreaterThan", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrderedScalar, "CompareGreaterThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanScalar, "CompareGreaterThanScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanUnorderedScalar, "CompareGreaterThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqual, 
"CompareGreaterThanOrEqual", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualOrderedScalar, "CompareGreaterThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualScalar, "CompareGreaterThanOrEqualScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareGreaterThanOrEqualUnorderedScalar, "CompareGreaterThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) 
HARDWARE_INTRINSIC(SSE_CompareLessThan, "CompareLessThan", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrderedScalar, "CompareLessThanOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareLessThanScalar, "CompareLessThanScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanUnorderedScalar, "CompareLessThanUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqual, "CompareLessThanOrEqual", SSE, 2, 16, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualOrderedScalar, "CompareLessThanOrEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualScalar, "CompareLessThanOrEqualScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareLessThanOrEqualUnorderedScalar, "CompareLessThanOrEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareNotEqual, "CompareNotEqual", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareNotEqualOrderedScalar, "CompareNotEqualOrderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_comiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_CompareNotEqualScalar, "CompareNotEqualScalar", SSE, 4, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_CompareNotEqualUnorderedScalar, "CompareNotEqualUnorderedScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_ucomiss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_MultiIns|HW_Flag_NoContainment) HARDWARE_INTRINSIC(SSE_CompareNotGreaterThan, "CompareNotGreaterThan", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanScalar, "CompareNotGreaterThanScalar", SSE, 2, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqual, "CompareNotGreaterThanOrEqual", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotGreaterThanOrEqualScalar, "CompareNotGreaterThanOrEqualScalar", SSE, 1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotLessThan, "CompareNotLessThan", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) 
-HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanScalar, "CompareNotLessThanScalar", SSE, 5, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqual, "CompareNotLessThanOrEqual", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareNotLessThanOrEqualScalar, "CompareNotLessThanOrEqualScalar", SSE, 6, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareOrdered, "CompareOrdered", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareOrderedScalar, "CompareOrderedScalar", SSE, 7, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_CompareUnordered, "CompareUnordered", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 
16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_MultiplyScalar, 
"MultiplyScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_CompareUnorderedScalar, "CompareUnorderedScalar", SSE, 3, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cmpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32, "ConvertToInt32", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToInt64, "ConvertToInt64", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToSingle, "ConvertToSingle", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToVector128SingleScalar, "ConvertToVector128SingleScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvtsi2ss, INS_invalid}, HW_Category_Special, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation, "ConvertToInt32WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_ConvertToInt64WithTruncation, "ConvertToInt64WithTruncation", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_cvttss2si, 
INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_Divide, "Divide", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_DivideScalar, "DivideScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_divss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_LoadAlignedVector128, "LoadAlignedVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadHigh, "LoadHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadLow, "LoadLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadScalar, "LoadScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_LoadVector128, "LoadVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Max, "Max", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) 
+HARDWARE_INTRINSIC(SSE_MaxScalar, "MaxScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_maxss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_Min, "Min", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MinScalar, "MinScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_minss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MoveHighToLow, "MoveHighToLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveLowToHigh, "MoveLowToHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveMask, "MoveMask", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movmskps, INS_invalid}, HW_Category_Special, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_MoveScalar, "MoveScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoContainment) +HARDWARE_INTRINSIC(SSE_Multiply, "Multiply", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) +HARDWARE_INTRINSIC(SSE_MultiplyScalar, "MultiplyScalar", SSE, -1, 16, 2, 
{INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mulss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_Commutative) HARDWARE_INTRINSIC(SSE_Or, "Or", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_orps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Reciprocal, "Reciprocal", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalScalar, "ReciprocalScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rcpss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) HARDWARE_INTRINSIC(SSE_ReciprocalSqrt, "ReciprocalSqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar, "ReciprocalSqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_rsqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_SetAllVector128, "SetAllVector128", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(SSE_SetScalar, "SetScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_Helper, HW_Flag_MultiIns) +HARDWARE_INTRINSIC(SSE_SetVector128, "SetVector128", SSE, -1, 16, 4, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(SSE_SetZeroVector128, "SetZeroVector128", SSE, -1, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_xorps, INS_invalid}, HW_Category_Helper, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Shuffle, "Shuffle", SSE, -1, 16, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_shufps, INS_invalid}, HW_Category_IMM, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_Sqrt, "Sqrt", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_Store, "Store", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movups, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreAligned, "StoreAligned", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movaps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal, "StoreAlignedNonTemporal", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movntps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreHigh, "StoreHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) 
-//HARDWARE_INTRINSIC(SSE_StoreLow, "StoreLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movlps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -//HARDWARE_INTRINSIC(SSE_StoreScalar, "StoreScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SqrtScalar, "SqrtScalar", SSE, -1, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sqrtss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_CopyUpperBits) +HARDWARE_INTRINSIC(SSE_StaticCast, "StaticCast", SSE, -1, 16, 1, {INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps, INS_movaps}, HW_Category_Helper, HW_Flag_TwoTypeGeneric) HARDWARE_INTRINSIC(SSE_Subtract, "Subtract", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_SubtractScalar, "SubtractScalar", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_subss, INS_invalid}, HW_Category_SIMDScalar, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_UnpackHigh, "UnpackHigh", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpckhps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) HARDWARE_INTRINSIC(SSE_UnpackLow, "UnpackLow", SSE, -1, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_unpcklps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) -HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_NoFlag) +HARDWARE_INTRINSIC(SSE_Xor, "Xor", SSE, -1, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_xorps, INS_invalid}, HW_Category_SimpleSIMD, HW_Flag_Commutative) // SSE2 Intrinsics HARDWARE_INTRINSIC(SSE2_IsSupported, "get_IsSupported", SSE2, -1, 0, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_IsSupportedProperty, HW_Flag_NoFlag) diff --git a/src/jit/hwintrinsicxarch.cpp b/src/jit/hwintrinsicxarch.cpp index a96952db6cb6..80091f189fac 100644 --- a/src/jit/hwintrinsicxarch.cpp +++ b/src/jit/hwintrinsicxarch.cpp @@ -16,7 +16,7 @@ struct HWIntrinsicInfo int numArgs; instruction ins[10]; HWIntrinsicCategory category; - HWIntrinsicFlag flag; + HWIntrinsicFlag flags; }; static const HWIntrinsicInfo hwIntrinsicInfoArray[] = { @@ -192,7 +192,7 @@ int Compiler::ivalOfHWIntrinsic(NamedIntrinsic intrinsic) static unsigned simdSizeOfHWIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig) { assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag & HW_Flag_UnfixedSIMDSize) == 0); + assert((hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags & HW_Flag_UnfixedSIMDSize) == 0); return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].simdSize; } @@ -248,19 +248,19 @@ HWIntrinsicCategory Compiler::categoryOfHWIntrinsic(NamedIntrinsic intrinsic) } //------------------------------------------------------------------------ -// 
HWIntrinsicFlag: get the flag of the given intrinsic +// HWIntrinsicFlag: get the flags of the given intrinsic // // Arguments: // intrinsic -- id of the intrinsic function. // // Return Value: -// the flag of the given intrinsic +// the flags of the given intrinsic // -HWIntrinsicFlag Compiler::flagOfHWIntrinsic(NamedIntrinsic intrinsic) +HWIntrinsicFlag Compiler::flagsOfHWIntrinsic(NamedIntrinsic intrinsic) { assert(intrinsic != NI_Illegal); assert(intrinsic > NI_HW_INTRINSIC_START && intrinsic < NI_HW_INTRINSIC_END); - return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flag; + return hwIntrinsicInfoArray[intrinsic - NI_HW_INTRINSIC_START - 1].flags; } //------------------------------------------------------------------------ @@ -290,7 +290,7 @@ GenTree* Compiler::getArgForHWIntrinsic(var_types argType, CORINFO_CLASS_HANDLE assert(varTypeIsArithmetic(argType)); arg = impPopStack().val; assert(varTypeIsArithmetic(arg->TypeGet())); - assert(genTypeSize(argType) <= genTypeSize(arg->TypeGet())); + assert(genActualType(arg->gtType) == genActualType(argType)); } return arg; } @@ -436,12 +436,10 @@ GenTree* Compiler::impUnsupportedHWIntrinsic(unsigned helper, // Return Value: // returns true if this category can be table-driven in the importer // -static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category) +static bool impIsTableDrivenHWIntrinsic(HWIntrinsicCategory category, HWIntrinsicFlag flags) { - // TODO - make more categories to the table-driven framework - const bool tableDrivenIntrinsic = category == HW_Category_SimpleSIMD; - const bool nonTableDrivenIntrinsic = category == HW_Category_Special; - return tableDrivenIntrinsic && !nonTableDrivenIntrinsic; + // HW_Flag_NoCodeGen implies this intrinsic should be manually morphed in the importer. 
+ return category != HW_Category_Special && category != HW_Category_Scalar && (flags & HW_Flag_NoCodeGen) == 0; } //------------------------------------------------------------------------ @@ -462,15 +460,24 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, { InstructionSet isa = isaOfHWIntrinsic(intrinsic); HWIntrinsicCategory category = categoryOfHWIntrinsic(intrinsic); + HWIntrinsicFlag flags = flagsOfHWIntrinsic(intrinsic); int numArgs = sig->numArgs; - var_types callType = JITtype2varType(sig->retType); + var_types retType = JITtype2varType(sig->retType); + var_types baseType = TYP_UNKNOWN; + if (retType == TYP_STRUCT && featureSIMD) + { + unsigned int sizeBytes; + baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); + retType = getSIMDTypeForSize(sizeBytes); + assert(sizeBytes != 0 && baseType != TYP_UNKNOWN); + } // This intrinsic is supported if // - the ISA is available on the underlying hardware (compSupports returns true) // - the compiler supports this hardware intrinsics (compSupportsHWIntrinsic returns true) // - intrinsics do not require 64-bit registers (r64) on 32-bit platforms (isTypeSupportedForIntrinsic returns // true) - bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(callType); + bool issupported = compSupports(isa) && compSupportsHWIntrinsic(isa) && isTypeSupportedForIntrinsic(retType); if (category == HW_Category_IsSupportedProperty) { @@ -481,22 +488,59 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, { return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); } + else if (category == HW_Category_IMM) + { + GenTree* lastOp = impStackTop().val; + if (!lastOp->IsCnsIntOrI() && !mustExpand) + { + // When the imm-argument is not a constant and we are not being forced to expand, we need to + // return nullptr so a GT_CALL to the intrinsic method is emitted instead. 
The + // intrinsic method is recursive and will be forced to expand, at which point + // we emit some less efficient fallback code. + return nullptr; + } + } + + if ((flags & HW_Flag_Generic) != 0) + { + assert(baseType != TYP_UNKNOWN); + // When the type argument is not a numeric type (and we are not being forced to expand), we need to + // return nullptr so a GT_CALL to the intrinsic method is emitted that will throw NotSupportedException + if (!varTypeIsArithmetic(baseType)) + { + assert(!mustExpand); + return nullptr; + } + + if ((flags & HW_Flag_TwoTypeGeneric) != 0) + { + // StaticCast has two type parameters. + assert(!mustExpand); + assert(numArgs == 1); + var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); + assert(srcType != TYP_UNKNOWN); + if (!varTypeIsArithmetic(srcType)) + { + return nullptr; + } + } + } // table-driven importer of simple intrinsics - if (impIsTableDrivenHWIntrinsic(category)) + if (impIsTableDrivenHWIntrinsic(category, flags)) { - unsigned int sizeBytes; - var_types baseType = getBaseTypeAndSizeOfSIMDType(sig->retTypeSigClass, &sizeBytes); - assert(baseType != TYP_UNKNOWN && sizeBytes != 0); - var_types retType = getSIMDTypeForSize(sizeBytes); + if (!varTypeIsSIMD(retType)) + { + baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); + assert(baseType != TYP_UNKNOWN); + } + unsigned simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); CORINFO_ARG_LIST_HANDLE argList = sig->args; CORINFO_CLASS_HANDLE argClass; var_types argType = TYP_UNKNOWN; - assert(numArgs > 0); - assert(retType != TYP_UNDEF); - assert(retType == TYP_SIMD16 || retType == TYP_SIMD32); + assert(numArgs >= 0); assert(insOfHWIntrinsic(intrinsic, baseType) != INS_invalid); assert(simdSize == 32 || simdSize == 16); @@ -506,10 +550,12 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, switch (numArgs) { + case 0: + retNode = gtNewSimdHWIntrinsicNode(retType, intrinsic, baseType, simdSize); + 
break; case 1: argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); break; case 2: @@ -537,8 +583,7 @@ GenTree* Compiler::impX86HWIntrinsic(NamedIntrinsic intrinsic, argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass))); op1 = getArgForHWIntrinsic(argType, argClass); - op1 = gtNewArgList(op1, op2, op3); - retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, op2, op3, intrinsic, baseType, simdSize); break; } default: @@ -653,11 +698,13 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, CORINFO_SIG_INFO* sig, bool mustExpand) { - GenTree* retNode = nullptr; - GenTree* op1 = nullptr; - GenTree* op2 = nullptr; - GenTree* op3 = nullptr; - GenTree* op4 = nullptr; + GenTree* retNode = nullptr; + GenTree* op1 = nullptr; + GenTree* op2 = nullptr; + GenTree* op3 = nullptr; + GenTree* op4 = nullptr; + int simdSize = simdSizeOfHWIntrinsic(intrinsic, sig); + assert(simdSize == 16); switch (intrinsic) { @@ -671,112 +718,14 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impPopStack().val; - GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, 16); - GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, 16); + GenTree* left = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op4, op3, NI_SSE_UnpackLow, TYP_FLOAT, simdSize); + GenTree* right = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE_UnpackLow, TYP_FLOAT, simdSize); GenTree* control = gtNewIconNode(68, TYP_UBYTE); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, 16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, left, right, control, NI_SSE_Shuffle, TYP_FLOAT, simdSize); 
break; } - case NI_SSE_Shuffle: - { - assert(sig->numArgs == 3); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - - op3 = impStackTop().val; - - if (op3->IsCnsIntOrI() || mustExpand) - { - impPopStack(); // Pop the value we peeked at - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, op3, intrinsic, TYP_FLOAT, 16); - } - else - { - // When op3 is not a constant and we are not being forced to expand, we need to - // return nullptr so a GT_CALL to the intrinsic method is emitted instead. The - // intrinsic method is recursive and will be forced to expand, at which point - // we emit some less efficient fallback code. - - return nullptr; - } - break; - } - - case NI_SSE_Add: - case NI_SSE_AddScalar: - case NI_SSE_And: - case NI_SSE_AndNot: - case NI_SSE_CompareEqual: - case NI_SSE_CompareEqualScalar: - case NI_SSE_CompareGreaterThan: - case NI_SSE_CompareGreaterThanScalar: - case NI_SSE_CompareGreaterThanOrEqual: - case NI_SSE_CompareGreaterThanOrEqualScalar: - case NI_SSE_CompareLessThan: - case NI_SSE_CompareLessThanScalar: - case NI_SSE_CompareLessThanOrEqual: - case NI_SSE_CompareLessThanOrEqualScalar: - case NI_SSE_CompareNotEqual: - case NI_SSE_CompareNotEqualScalar: - case NI_SSE_CompareNotGreaterThan: - case NI_SSE_CompareNotGreaterThanScalar: - case NI_SSE_CompareNotGreaterThanOrEqual: - case NI_SSE_CompareNotGreaterThanOrEqualScalar: - case NI_SSE_CompareNotLessThan: - case NI_SSE_CompareNotLessThanScalar: - case NI_SSE_CompareNotLessThanOrEqual: - case NI_SSE_CompareNotLessThanOrEqualScalar: - case NI_SSE_CompareOrdered: - case NI_SSE_CompareOrderedScalar: - case NI_SSE_CompareUnordered: - case NI_SSE_CompareUnorderedScalar: - case NI_SSE_Divide: - case NI_SSE_DivideScalar: - case NI_SSE_Max: - case NI_SSE_MaxScalar: - case NI_SSE_Min: - case NI_SSE_MinScalar: - case NI_SSE_MoveHighToLow: - case NI_SSE_MoveLowToHigh: - case NI_SSE_MoveScalar: - case 
NI_SSE_Multiply: - case NI_SSE_MultiplyScalar: - case NI_SSE_Or: - case NI_SSE_Subtract: - case NI_SSE_SubtractScalar: - case NI_SSE_UnpackHigh: - case NI_SSE_UnpackLow: - case NI_SSE_Xor: - assert(sig->numArgs == 2); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_CompareEqualOrderedScalar: - case NI_SSE_CompareEqualUnorderedScalar: - case NI_SSE_CompareGreaterThanOrderedScalar: - case NI_SSE_CompareGreaterThanUnorderedScalar: - case NI_SSE_CompareGreaterThanOrEqualOrderedScalar: - case NI_SSE_CompareGreaterThanOrEqualUnorderedScalar: - case NI_SSE_CompareLessThanOrderedScalar: - case NI_SSE_CompareLessThanUnorderedScalar: - case NI_SSE_CompareLessThanOrEqualOrderedScalar: - case NI_SSE_CompareLessThanOrEqualUnorderedScalar: - case NI_SSE_CompareNotEqualOrderedScalar: - case NI_SSE_CompareNotEqualUnorderedScalar: - assert(sig->numArgs == 2); - assert(JITtype2varType(sig->retType) == TYP_BOOL); - assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); - op2 = impSIMDPopStack(TYP_SIMD16); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_BOOL, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - case NI_SSE_ConvertToVector128SingleScalar: { assert(sig->numArgs == 2); @@ -797,18 +746,7 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, op2 = impPopStack().val; op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); - break; - } - - case NI_SSE_LoadHigh: - case NI_SSE_LoadLow: - { - assert(sig->numArgs == 2); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op2 = impPopStack().val; - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, 16); + retNode 
= gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_FLOAT, simdSize); break; } @@ -817,77 +755,15 @@ GenTree* Compiler::impSSEIntrinsic(NamedIntrinsic intrinsic, assert(JITtype2varType(sig->retType) == TYP_INT); assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_StaticCast: - { - assert(sig->numArgs == 1); - var_types tgtType = getBaseTypeOfSIMDType(sig->retTypeSigClass); - var_types srcType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)); - - if (varTypeIsArithmetic(tgtType) && varTypeIsArithmetic(srcType)) - { - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, tgtType, 16); - } - else - { - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } + retNode = gtNewSimdHWIntrinsicNode(TYP_INT, op1, intrinsic, TYP_FLOAT, simdSize); break; - } - case NI_SSE_LoadAlignedVector128: - case NI_SSE_LoadScalar: - case NI_SSE_LoadVector128: case NI_SSE_SetAllVector128: - case NI_SSE_SetScalar: assert(sig->numArgs == 1); assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); op1 = impPopStack().val; - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_Reciprocal: - case NI_SSE_ReciprocalScalar: - case NI_SSE_ReciprocalSqrt: - case NI_SSE_ReciprocalSqrtScalar: - case NI_SSE_Sqrt: - case NI_SSE_SqrtScalar: - assert(sig->numArgs == 1); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, TYP_FLOAT, 16); - break; - - case NI_SSE_ConvertToInt32: - case NI_SSE_ConvertToInt32WithTruncation: - case NI_SSE_ConvertToInt64: - case NI_SSE_ConvertToInt64WithTruncation: - case NI_SSE_ConvertToSingle: - { 
- assert(sig->numArgs == 1); - assert(getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args)) == TYP_FLOAT); - var_types callType = JITtype2varType(sig->retType); - -#ifdef _TARGET_X86_ - if (varTypeIsLong(callType)) - { - assert(intrinsic == NI_SSE_ConvertToInt64 || intrinsic == NI_SSE_ConvertToInt64WithTruncation); - return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand); - } -#endif // _TARGET_X86_ - - op1 = impSIMDPopStack(TYP_SIMD16); - retNode = gtNewSimdHWIntrinsicNode(callType, op1, intrinsic, TYP_FLOAT, 16); - break; - } - - case NI_SSE_SetZeroVector128: - assert(sig->numArgs == 0); - assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT); - retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, intrinsic, TYP_FLOAT, 16); + retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtCloneExpr(op1), gtNewIconNode(0), NI_SSE_Shuffle, + TYP_FLOAT, simdSize); break; default: diff --git a/src/jit/instrsxarch.h b/src/jit/instrsxarch.h index f48a6ce6ef4b..1b8533248e83 100644 --- a/src/jit/instrsxarch.h +++ b/src/jit/instrsxarch.h @@ -252,7 +252,6 @@ INST3( andnpd, "andnpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x55)) / INST3( orps, "orps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x56)) // Or packed singles INST3( orpd, "orpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x56)) // Or packed doubles INST3( haddpd, "haddpd", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKDBL(0x7C)) // Horizontal add packed doubles -INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocals of Packed Singles // SSE 2 approx arith INST3( rcpps, "rcpps", 0, IUM_WR, 0, 0, BAD_CODE, BAD_CODE, PCKFLT(0x53)) // Reciprocal of packed singles diff --git a/src/jit/lowerxarch.cpp b/src/jit/lowerxarch.cpp index 4169b4a4106e..a046704c91be 100644 --- a/src/jit/lowerxarch.cpp +++ b/src/jit/lowerxarch.cpp @@ -2305,14 +2305,17 @@ void Lowering::ContainCheckSIMD(GenTreeSIMD* simdNode) void 
Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) { NamedIntrinsic intrinsicID = node->gtHWIntrinsicId; - HWIntrinsicCategory category = comp->categoryOfHWIntrinsic(intrinsicID); - int numArgs = comp->numArgsOfHWIntrinsic(intrinsicID); + HWIntrinsicCategory category = Compiler::categoryOfHWIntrinsic(intrinsicID); + HWIntrinsicFlag flags = Compiler::flagsOfHWIntrinsic(intrinsicID); + int numArgs = Compiler::numArgsOfHWIntrinsic(intrinsicID); GenTree* op1 = node->gtGetOp1(); GenTree* op2 = node->gtGetOp2(); // TODO-XArch-CQ: Non-VEX encoded instructions can have both ops contained // TODO-XArch-CQ: Non-VEX encoded instructions require memory ops to be aligned - if (category == HW_Category_SimpleSIMD && numArgs == 2 && comp->canUseVexEncoding()) + + if (comp->canUseVexEncoding() && numArgs == 2 && (flags & HW_Flag_NoContainment) == 0 && + category == HW_Category_SimpleSIMD) { if (IsContainableMemoryOp(op2)) { @@ -2325,7 +2328,8 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node) } } - if (NamedIntrinsic == NI_SSE_Shuffle) + // TODO - change to all IMM intrinsics + if (intrinsicID == NI_SSE_Shuffle) { assert(op1->OperIsList()); GenTree* op3 = op1->AsArgList()->Rest()->Rest()->Current(); diff --git a/src/jit/lsraxarch.cpp b/src/jit/lsraxarch.cpp index f56a36a7d077..fbadd566c8ef 100644 --- a/src/jit/lsraxarch.cpp +++ b/src/jit/lsraxarch.cpp @@ -2564,7 +2564,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, info->internalIntCount = 2; info->setInternalCandidates(this, allRegs(TYP_INT)); - break; } break; } @@ -2577,7 +2576,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, break; case NI_SSE41_BlendVariable: - { if (!compiler->canUseVexEncoding()) { // SSE4.1 blendv* hardcode the mask vector (op3) in XMM0 @@ -2589,7 +2587,6 @@ void LinearScan::TreeNodeInfoInitHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, info->hasDelayFreeSrc = true; } break; - } #ifdef _TARGET_X86_ case 
NI_SSE42_Crc32: diff --git a/src/jit/namedintrinsiclist.h b/src/jit/namedintrinsiclist.h index 8d5aac28f3a2..6387f60cbe51 100644 --- a/src/jit/namedintrinsiclist.h +++ b/src/jit/namedintrinsiclist.h @@ -39,18 +39,33 @@ enum HWIntrinsicFlag : unsigned int // Generic // - must throw NotSupportException if the type argument is not numeric type HW_Flag_Generic = 0x4, + // Two-type Generic + // - the intrinsic has two type parameters + HW_Flag_TwoTypeGeneric = 0xC, // NoCodeGen // - should be transformed in the compiler front-end, cannot reach CodeGen - HW_Flag_NoCodeGen = 0x8, + HW_Flag_NoCodeGen = 0x10, // Unfixed SIMD-size // - overloaded on multiple vector sizes (SIMD size in the table is unreliable) - HW_Flag_UnfixedSIMDSize = 0x10, + HW_Flag_UnfixedSIMDSize = 0x20, // Complex overload // - the codegen of overloads cannot be determined by intrinsicID and base type - HW_Flag_ComplexOverloads = 0x20, + HW_Flag_ComplexOverloads = 0x40, + + // Multi-instruction + // - that one intrinsic can generate multiple instructions + HW_Flag_MultiIns = 0x80, + + // NoContainment + // the intrinsic cannot be contained + HW_Flag_NoContainment = 0x100, + + // Copy Upper bits + // some SIMD scalar intrinsics need the semantics of copying upper bits from the source operand + HW_Flag_CopyUpperBits = 0x200, }; inline HWIntrinsicFlag operator|(HWIntrinsicFlag c1, HWIntrinsicFlag c2) @@ -63,7 +78,6 @@ enum HWIntrinsicCategory : unsigned int // Simple SIMD intrinsics // - take Vector128/256 parameters // - return a Vector128/256 - // - generate single instruction // - the codegen of overloads can be determined by intrinsicID and base type of returned vector HW_Category_SimpleSIMD, @@ -79,6 +93,10 @@ enum HWIntrinsicCategory : unsigned int // - operate over general purpose registers, like crc32, lzcnt, popcnt, etc. 
HW_Category_Scalar, + // SIMD scalar + // - operate over vector registers(XMM), but just compute on the first element + HW_Category_SIMDScalar, + // Memory access intrinsics // - e.g., Avx.Load, Avx.Store, Sse.LoadAligned HW_Category_MemoryLoad,