-
Notifications
You must be signed in to change notification settings - Fork 5.3k
[Draft] Accelerate Half with FP16 ISA
#122649
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -127,8 +127,9 @@ bool emitter::Is3OpRmwInstruction(instruction ins) | |
| default: | ||
| { | ||
| return ((ins >= FIRST_FMA_INSTRUCTION) && (ins <= LAST_FMA_INSTRUCTION)) || | ||
| (IsAVXVNNIFamilyInstruction(ins)) || | ||
| ((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION)); | ||
| IsAVXVNNIFamilyInstruction(ins) || | ||
| ((ins >= FIRST_AVXIFMA_INSTRUCTION) && (ins <= LAST_AVXIFMA_INSTRUCTION)) || | ||
| ((ins >= FIRST_AVX10V1_FMA_INSTR) && (ins <= LAST_AVX10V1_FMA_INSTR)); | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -3077,7 +3078,7 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
| // 1. An escape byte 0F (For isa before AVX10.2) | ||
| // 2. A map number from 0 to 7 (For AVX10.2 and above) | ||
| leadingBytes = check; | ||
| assert((leadingBytes == 0x0F) || ((m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2) || | ||
| assert((leadingBytes == 0x0F) || ((m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v1) || | ||
| (m_compiler->compIsaSupportedDebugOnly(InstructionSet_APX))) && | ||
| (leadingBytes >= 0x00) && (leadingBytes <= 0x07))); | ||
|
|
||
|
|
@@ -3159,15 +3160,21 @@ emitter::code_t emitter::emitExtractEvexPrefix(instruction ins, code_t& code) co | |
|
|
||
| case 0x05: | ||
| { | ||
| assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v2)); | ||
| assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); | ||
| evexPrefix |= (0x05 << 16); | ||
| break; | ||
| } | ||
|
|
||
| case 0x06: | ||
| { | ||
| assert(m_compiler->compIsaSupportedDebugOnly(InstructionSet_AVX10v1)); | ||
| evexPrefix |= (0x06 << 16); | ||
| break; | ||
| } | ||
|
|
||
| case 0x01: | ||
| case 0x02: | ||
| case 0x03: | ||
| case 0x06: | ||
| case 0x07: | ||
| default: | ||
| { | ||
|
|
@@ -5388,10 +5395,8 @@ UNATIVE_OFFSET emitter::emitInsSizeAM(instrDesc* id, code_t code) | |
|
|
||
| assert((attrSize == EA_4BYTE) || (attrSize == EA_PTRSIZE) // Only for x64 | ||
| || (attrSize == EA_16BYTE) || (attrSize == EA_32BYTE) || (attrSize == EA_64BYTE) // only for x64 | ||
| || (ins == INS_movzx) || (ins == INS_movsx) || | ||
| (ins == INS_cmpxchg) | ||
| // kmov instructions reach this path with EA_8BYTE size, even on x86 | ||
| || IsKMOVInstruction(ins) | ||
|
Comment on lines
-5393
to
-5394
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the reason for removing this part of the assert?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Think that was an error, will fix. |
||
| || (ins == INS_movzx) || (ins == INS_movsx) || (ins == INS_vmovsh) || (ins == INS_cmpxchg) || | ||
| IsKMOVInstruction(ins) | ||
| // The prefetch instructions are always 3 bytes and have part of their modr/m byte hardcoded | ||
| || isPrefetch(ins)); | ||
|
|
||
|
|
@@ -7424,6 +7429,7 @@ bool emitter::IsMovInstruction(instruction ins) | |
| case INS_kmovw_gpr: | ||
| case INS_kmovd_gpr: | ||
| case INS_kmovq_gpr: | ||
| case INS_vmovsh: | ||
| { | ||
| return true; | ||
| } | ||
|
|
@@ -7622,6 +7628,13 @@ bool emitter::HasSideEffect(instruction ins, emitAttr size) | |
| break; | ||
| } | ||
|
|
||
| case INS_vmovsh: | ||
| { | ||
| // Clears the upper bits | ||
| hasSideEffect = true; | ||
| break; | ||
| } | ||
|
|
||
| default: | ||
| { | ||
| unreached(); | ||
|
|
@@ -7895,6 +7908,12 @@ bool emitter::emitIns_Mov( | |
| break; | ||
| } | ||
|
|
||
| case INS_vmovsh: | ||
| { | ||
| assert(isFloatReg(dstReg) && isFloatReg(srcReg)); | ||
| break; | ||
| } | ||
|
|
||
| default: | ||
| { | ||
| unreached(); | ||
|
|
@@ -11797,6 +11816,10 @@ const char* emitter::emitRegName(regNumber reg, emitAttr attr, bool varName) con | |
|
|
||
| case EA_2BYTE: | ||
| { | ||
| if (IsXMMReg(reg)) | ||
| { | ||
| return emitXMMregName(reg); | ||
| } | ||
|
Comment on lines
+11819
to
+11822
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This shouldn't be |
||
| #if defined(TARGET_AMD64) | ||
| if (reg > REG_RDI) | ||
| { | ||
|
|
@@ -14522,7 +14545,7 @@ BYTE* emitter::emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) | |
| // Is this a 'big' opcode? | ||
| else if (code & 0xFF000000) | ||
| { | ||
| if (size == EA_2BYTE) | ||
| if (size == EA_2BYTE && ins != INS_vmovsh) | ||
| { | ||
| assert(ins == INS_movbe); | ||
|
|
||
|
|
@@ -15390,7 +15413,7 @@ BYTE* emitter::emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc) | |
| // Is this a 'big' opcode? | ||
| else if (code & 0xFF000000) | ||
| { | ||
| if (size == EA_2BYTE) | ||
| if (size == EA_2BYTE && !IsSimdInstruction(ins)) | ||
| { | ||
| assert(ins == INS_movbe); | ||
|
|
||
|
|
@@ -20894,28 +20917,29 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins | |
| case INS_movups: | ||
| case INS_movapd: | ||
| case INS_movupd: | ||
| { | ||
| if (memAccessKind == PERFSCORE_MEMORY_NONE) | ||
| // todo-xarch-half: come back to fix | ||
| { | ||
| // ins reg, reg | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_4X; | ||
| result.insLatency = PERFSCORE_LATENCY_ZERO; | ||
| } | ||
| else if (memAccessKind == PERFSCORE_MEMORY_READ) | ||
| { | ||
| // ins reg, mem | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_2X; | ||
| result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; | ||
| } | ||
| else | ||
| { | ||
| // ins mem, reg | ||
| assert(memAccessKind == PERFSCORE_MEMORY_WRITE); | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| result.insLatency += PERFSCORE_LATENCY_2C; | ||
| if (memAccessKind == PERFSCORE_MEMORY_NONE) | ||
| { | ||
| // ins reg, reg | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_4X; | ||
| result.insLatency = PERFSCORE_LATENCY_ZERO; | ||
| } | ||
| else if (memAccessKind == PERFSCORE_MEMORY_READ) | ||
| { | ||
| // ins reg, mem | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_2X; | ||
| result.insLatency += opSize == EA_32BYTE ? PERFSCORE_LATENCY_3C : PERFSCORE_LATENCY_2C; | ||
| } | ||
| else | ||
| { | ||
| // ins mem, reg | ||
| assert(memAccessKind == PERFSCORE_MEMORY_WRITE); | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| result.insLatency += PERFSCORE_LATENCY_2C; | ||
| } | ||
| break; | ||
| } | ||
| break; | ||
| } | ||
|
|
||
| case INS_movhps: | ||
| case INS_movhpd: | ||
|
|
@@ -20946,6 +20970,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins | |
| case INS_movss: | ||
| case INS_movsd_simd: | ||
| case INS_movddup: | ||
| case INS_vmovsh: | ||
| { | ||
| if (memAccessKind == PERFSCORE_MEMORY_NONE) | ||
| { | ||
|
|
@@ -21377,6 +21402,67 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins | |
| break; | ||
| } | ||
|
|
||
| case INS_vaddsh: | ||
| case INS_vsubsh: | ||
| case INS_vmulsh: | ||
| case INS_vfmadd213sh: | ||
| case INS_vmaxsh: | ||
| case INS_vminsh: | ||
| case INS_vcvtsh2ss: | ||
| result.insLatency = PERFSCORE_LATENCY_4C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_2X; | ||
| break; | ||
|
|
||
| case INS_vdivsh: | ||
| result.insLatency = PERFSCORE_LATENCY_14C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_4C; | ||
| break; | ||
|
|
||
| case INS_vsqrtsh: | ||
| result.insLatency = PERFSCORE_LATENCY_14C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_4P5C; | ||
| break; | ||
|
|
||
| case INS_vrsqrtsh: | ||
| case INS_vcomish: | ||
| case INS_vucomish: | ||
| case INS_vrcpsh: | ||
| result.insLatency = PERFSCORE_LATENCY_4C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| break; | ||
|
|
||
| case INS_vrndscalesh: | ||
| result.insLatency = PERFSCORE_LATENCY_8C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| break; | ||
|
|
||
| case INS_vcvtss2sh: | ||
| result.insLatency = PERFSCORE_LATENCY_6C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1P5X; | ||
| break; | ||
|
|
||
| case INS_vcvtsd2sh: | ||
| result.insLatency = PERFSCORE_THROUGHPUT_ILLEGAL; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_ILLEGAL; | ||
| break; | ||
|
|
||
| case INS_vcvtsh2sd: | ||
| result.insLatency = PERFSCORE_LATENCY_10C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| break; | ||
|
|
||
| case INS_vcvtsi2sh32: | ||
| case INS_vcvtsi2sh64: | ||
| case INS_vcvtsh2si32: | ||
| case INS_vcvtsh2si64: | ||
| case INS_vcvtusi2sh32: | ||
| case INS_vcvtusi2sh64: | ||
| case INS_vcvtsh2usi32: | ||
| case INS_vcvtsh2usi64: | ||
| result.insLatency = PERFSCORE_LATENCY_7C; | ||
| result.insThroughput = PERFSCORE_THROUGHPUT_1C; | ||
| break; | ||
|
|
||
| default: | ||
| { | ||
| assert((unsigned)ins < ArrLen(insThroughputInfos)); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.